Add format type classification
Format type classification is added to the output so that reports are
updated automatically when it is available. Heading consistency across
demystify has also been improved.
ross-spencer committed Mar 24, 2024
1 parent 6a2ce9f commit bb20900
Showing 5 changed files with 109 additions and 19 deletions.
49 changes: 30 additions & 19 deletions src/demystify/i18n/internationalstrings.py
@@ -36,7 +36,7 @@ class AnalysisStringsEN:
)

SUMMARY_UNIQUE_EXTENSIONS = "Total Unique Extensions Across Accession/Extract"
SUMMARY_ZERO_BYTE = "Total Zero-byte Files in Accession/Extract"
SUMMARY_ZERO_BYTE = "Total Zero-byte Files in the Accession/Extract"
SUMMARY_IDENTICAL_FILES = "Total Files with Identical Content (Checksum Value)"
SUMMARY_PERCENTAGE_IDENTIFIED = "Percentage of Accession/Extract Identified"
SUMMARY_PERCENTAGE_UNIDENTIFIED = "Percentage of Accession/Extract Unidentified"
@@ -123,9 +123,7 @@ class AnalysisStringsEN:
"understand an accession/extract."
)

HEADING_BINARY_ID = (
"Aggregated Frequency of File Format Signature Identifiers in Accession/Extract"
)
HEADING_BINARY_ID = "Aggregated Frequency of File Format Signature Identifiers in the Accession/Extract"
HEADING_DESC_BINARY_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using file format signatures. Some tools can report what the basis "
@@ -135,7 +133,7 @@ class AnalysisStringsEN:
+ COUNT_TEXT
)

HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in Accession/Extract"
HEADING_XML_ID = "Aggregated Frequency of XML Identifiers in the Accession/Extract"
HEADING_DESC_XML_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using XML based identification. Some tools can report what the basis "
@@ -145,7 +143,9 @@ class AnalysisStringsEN:
+ COUNT_TEXT
)

HEADING_TEXT_ID = "Aggregated Frequency of Text Identifiers in Accession/Extract"
HEADING_TEXT_ID = (
"Aggregated Frequency of Text Identifiers in the Accession/Extract"
)
HEADING_DESC_TEXT_ID = (
"Listing of format identification, namespace, count, and match basis for files "
"identified using Text based identification. Some tools can report what the basis "
@@ -156,7 +156,7 @@ class AnalysisStringsEN:
)

HEADING_FILENAME_ID = (
"Aggregated Frequency of Filename Identifiers in Accession/Extract"
"Aggregated Frequency of Filename Identifiers in the Accession/Extract"
)
HEADING_DESC_FILENAME_ID = (
"Listing of format identification, namespace, count, and match basis for files "
@@ -234,7 +234,7 @@ class AnalysisStringsEN:
)

HEADING_AGGREGATE_BINARY_IDENTIFIED = (
"Aggregated File Format Signature Identifiers in Accession/Extract"
"Aggregated File Format Signature Identifiers in the Accession/Extract"
)
HEADING_DESC_IDENTIFIED = (
"A list of ID values and format names to provide a clear picture of diversity/complexity of the "
@@ -248,20 +248,23 @@ class AnalysisStringsEN:
HEADING_FREQUENCY_PUIDS_IDENTIFIED = (
"Frequency of File Format Signature Identified IDs"
)
HEADING_CLASSIFICATION = (
"Range of format type classifciation in the Accession/Extract"
)
HEADING_DATE_RANGE = "Date Range of Items in the Accession/Extract"
HEADING_EXTENSION_ONLY = "Extension Only Identification in the Accession/Extract"
HEADING_ID_METHOD = "Identification Method Frequency"
HEADING_FREQUENCY_EXTENSION_ONLY = (
"Frequency of Extension Only Identification In Accession/Extract"
"Frequency of Extension Only Identification in the Accession/Extract"
)
HEADING_UNIQUE_EXTENSIONS = (
"Unique Extensions Identified Across All Objects (ID and non-ID)"
)
HEADING_LIST_MULTIPLE = "List of Files With Multiple Identifications"
HEADING_FREQUENCY_EXTENSIONS_ALL = "Frequency of All Extensions"
HEADING_FREQUENCY_MIME = "MIME Type (Internet Media Type) Frequency"
HEADING_LIST_ZERO_BYTES = "Zero-byte files in Accession/Extract"
HEADING_ARCHIVE_FORMATS = "Archive File Types in Accession/Extract"
HEADING_LIST_ZERO_BYTES = "Zero-byte files in the Accession/Extract"
HEADING_ARCHIVE_FORMATS = "Archive File Types in the Accession/Extract"
HEADING_IDENTICAL_CONTENT = "Files With Identical Content (Checksum Value)"
HEADING_TROUBLESOME_FILENAMES = "Identifying Non-ASCII and System File Names"
HEADING_TROUBLESOME_DIRNAMES = "Identifying Non-ASCII and System Directory Names"
@@ -363,6 +366,12 @@ class AnalysisStringsEN:
"is represented in the accession/extract, in a descending list from most frequent to least."
)

HEADING_DESC_CLASSIFICATION = (
"Count and visialization giving a clear illustration of the distribution of file format type classifications "
"across the accession/extract. The list is sorted by count in descending order and provides a measure "
"of how homogenous a collection is which is potentially indicative of its complexity to preserve."
)

HEADING_DESC_DATE_RANGE = (
"Count and visualization giving a clear illustration of the distribution of file modification dates across the "
"accession/extract. The list is in descending order based on the number of files last modified on any given year. "
@@ -403,7 +412,7 @@ class AnalysisStringsEN:
)

HEADING_DESC_FREQUENCY_EXTENSIONS_ALL = (
"Lists the gamut of file extensions alongside how many times they appear in accession/extract "
"Lists the gamut of file extensions alongside how many times they appear in the accession/extract "
"in descending order. This information can be used to identify the diversity/complexity of the "
"accession/extract, but also to identify the consistency with which extensions may have been "
"used in the accession/extract and may indicate how much work may be needed to correct "
@@ -454,10 +463,12 @@ class AnalysisStringsEN:
"<a href='https://msdn.microsoft.com/en-nz/library/windows/desktop/aa365247%28v=vs.85%29.aspx?f=255&MSPPError=-2147217396'>Microsoft</a>"
)

HEADING_DENYLIST_IDS = "Denylist Identifiers found in Accession/Extract"
HEADING_DENYLIST_EXTS = "Denylist Filename Extensions found in Accession/Extract"
HEADING_DENYLIST_DIRS = "Denylist Directory Names found in Accession/Extract"
HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in Accession/Extract"
HEADING_DENYLIST_IDS = "Denylist Identifiers found in the Accession/Extract"
HEADING_DENYLIST_EXTS = (
"Denylist Filename Extensions found in the Accession/Extract"
)
HEADING_DENYLIST_DIRS = "Denylist Directory Names found in the Accession/Extract"
HEADING_DENYLIST_FILENAMES = "Denylist Filenames found in the Accession/Extract"
HEADING_DESC_DENYLIST = """Lists objects which are considered undesirable inside an accession/extract.
The denylist is entirely configurable by user and may contain files that can be
identified as undesirable, e.g. system files; or file names that may need pre-conditioning
@@ -466,15 +477,15 @@ class AnalysisStringsEN:
strings using a wildcard search. IDs e.g. PUID identifiers, and filename extension searches
are understandably more precise."""

TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in Accession/Extract"
TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in Accession/Extract"
TEXT_ONLY_FIVE_TOP_PUIDS = "Five Top PUIDs in the Accession/Extract"
TEXT_ONLY_FIVE_TOP_EXTENSIONS = "Five Top Extensions in the Accession/Extract"

COLUMN_HEADER_VALUES_NAMESPACE = "Namespace"
COLUMN_HEADER_VALUES_ID = "ID"
COLUMN_HEADER_VALUES_FORMAT = "Format Name"
COLUMN_HEADER_VALUES_COUNT = "Count"
COLUMN_HEADER_VALUES_YEAR = "Year"
COLUMN_HEADER_VALUES_YEAR = "Volume"
COLUMN_HEADER_VALUES_CLASSIFICATION = "Classification"

FNAME_CHECK_ASCII = "contains, characters outside of ASCII range"
FNAME_CHECK_PERIOD = "has a period '.' as its last character"
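The output handlers read these constants through a STRINGS handle; a minimal hedged illustration, assuming the English strings class and the import path shown below:

# Hedged illustration: read one of the new heading constants directly off
# the English strings class; the import path is an assumption for this example.
from demystify.i18n.internationalstrings import AnalysisStringsEN

print(AnalysisStringsEN.HEADING_CLASSIFICATION)
# -> "Range of Format Type Classification in the Accession/Extract"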
21 changes: 21 additions & 0 deletions src/demystify/libs/AnalysisQueriesClass.py
@@ -103,6 +103,27 @@ def methods_return_ns_sort(self, ns_id):
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
)

SELECT_CLASSIFICATION_COUNT = (
"SELECT COUNT(IDDATA.classification)\n"
"FROM IDRESULTS\n"
"JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n"
"JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n"
"WHERE (NSDATA.NS_NAME='pronom')\n"
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
"AND IDDATA.classification != 'None'"
)

SELECT_CLASSIFICATION_FREQUENCY = (
"SELECT IDDATA.classification,\n"
"COUNT(*) as TOTAL\n"
"FROM IDRESULTS\n"
"JOIN NSDATA on IDDATA.NS_ID = NSDATA.NS_ID\n"
"JOIN IDDATA on IDRESULTS.ID_ID = IDDATA.ID_ID\n"
"WHERE (NSDATA.NS_NAME='pronom')\n"
"AND (IDDATA.METHOD='Signature' OR IDDATA.METHOD='Container')"
"GROUP BY IDDATA.classification ORDER BY TOTAL DESC"
)

# PRONOM and OTHERS Text identifiers as one result
# PRONOM and OTHERS Text identifiers as one result
@staticmethod
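As a rough sketch of how the new query strings behave, the example below runs SELECT_CLASSIFICATION_FREQUENCY directly against a sqlite3 connection. The database path, the bare sqlite3 usage, and the import path are assumptions for illustration; demystify itself executes these strings through its _querydb() wrapper, as the DemystifyAnalysisClass.py change below shows.

# Minimal sketch: run the new frequency query directly with sqlite3.
# DATABASE_PATH and the import path are assumptions for this example;
# demystify executes these strings via its own _querydb() helper instead.
import sqlite3

from demystify.libs.AnalysisQueriesClass import AnalysisQueries

DATABASE_PATH = "analysis.db"  # hypothetical demystify results database

connection = sqlite3.connect(DATABASE_PATH)
cursor = connection.cursor()
cursor.execute(AnalysisQueries.SELECT_CLASSIFICATION_FREQUENCY)

# Each row is a (classification, count) pair, ordered by count descending,
# e.g. ("Word Processor", 120), ("Image (Raster)", 45), ("None", 3).
for classification, total in cursor.fetchall():
    print(f"{classification}: {total}")

connection.close()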
3 changes: 3 additions & 0 deletions src/demystify/libs/AnalysisResultsClass.py
@@ -96,6 +96,9 @@ def __init__(self):
self.badFileNames = None
self.badDirNames = None

self.classifications_count = 0
self.classifications = []

# Hash related values.
self.hashused = False
self.duplicateHASHlisting = None
12 changes: 12 additions & 0 deletions src/demystify/libs/DemystifyAnalysisClass.py
@@ -979,6 +979,18 @@ def queryDB(self):
if self.rogueanalysis:
self._handle_rogue_analysis()

self.analysis_results.classifications_count = int(
self._querydb(
AnalysisQueries.SELECT_CLASSIFICATION_COUNT,
True,
True,
)
)

self.analysis_results.classifications = self._querydb(
AnalysisQueries.SELECT_CLASSIFICATION_FREQUENCY
)

return self.analysis_results

def _handle_rogue_analysis(self):
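Once queryDB() has run, the two new fields sit on the returned results object alongside the existing analysis values. A hedged sketch of their expected shape, with illustrative values and an assumed DemystifyAnalysis instance named analysis:

# Illustrative only: the instance name and the concrete values are assumptions.
results = analysis.queryDB()

print(results.classifications_count)  # e.g. 168 files carrying a format type classification
print(results.classifications)        # e.g. [("Word Processor", 120), ("Image (Raster)", 45), ("None", 3)]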
43 changes: 43 additions & 0 deletions src/demystify/libs/outputhandlers/htmloutputclass.py
@@ -768,6 +768,49 @@ def generateHTML(self):
)
self.identifierchart(countlist)

#######################################################################
### CLASSIFICATION
#######################################################################

if self.analysis_results.classifications_count > 0:
self._outputheading(
self.STRINGS.HEADING_CLASSIFICATION,
self.STRINGS.HEADING_DESC_CLASSIFICATION,
)
self.printFormattedText("<table>")
self.printFormattedText(
'<tr><th style="text-align: left;">{}</th><th style="text-align: left;">{}</th></tr>'.format(
self.STRINGS.COLUMN_HEADER_VALUES_CLASSIFICATION,
self.STRINGS.COLUMN_HEADER_VALUES_COUNT,
)
)

for format_classification in self.analysis_results.classifications:
classification = format_classification[0]
if classification.lower() == "none":
classification = "No format type classification"
self.printFormattedText('<tr><td style="width: 300px;">')
self.printFormattedText(f"{classification}")
self.printFormattedText(
"</td><td>{}</td>".format(format_classification[1])
)

# Render a meter to visualize the classification count against the total file count.
self.printFormattedText(
self._outputmeter(
format_classification[1], 0, self.analysis_results.filecount
)
)
self.printFormattedText("</tr>")

self.printFormattedText("</table>")
self._htmlnewline()
self.printFormattedText("<hr/>")

#######################################################################
### DATE RANGE
#######################################################################

if self.analysis_results.dateFrequency is not None:
# Date Ranges
self._outputheading(
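For reference, a standalone sketch of the table this block emits, using hypothetical counts and a plain print() helper in place of printFormattedText(); the meter markup is omitted:

# Standalone sketch of the classification table; the sample data and the
# render() helper are assumptions standing in for the report machinery.
classifications = [("Word Processor", 120), ("Image (Raster)", 45), ("None", 3)]


def render(text):
    print(text)


render("<table>")
render(
    '<tr><th style="text-align: left;">Classification</th>'
    '<th style="text-align: left;">Count</th></tr>'
)
for classification, count in classifications:
    if classification.lower() == "none":
        classification = "No format type classification"
    render(
        '<tr><td style="width: 300px;">{}</td><td>{}</td></tr>'.format(
            classification, count
        )
    )
render("</table>")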
