Update readme and small correction in default params
Cyrilvallez committed May 4, 2022
1 parent 0002b68 commit 6bf8a6d
Showing 7 changed files with 107 additions and 40 deletions.
77 changes: 76 additions & 1 deletion README.md
@@ -1 +1,76 @@
-# Detection of image usage in social engineering on social media
+# Image manipulation detection

A library for image manipulation detection. It supports 3 classes of algorithms:

- Perceptual hashing (fast and simple methods designed for image forensics). The following algorithms are implemented in `hashing/imagehash.py`:
    - Average hash
    - Perceptual hash
    - Difference hash
    - Wavelet hash
    - Crop resistant hash
    - Color hash
    - Histogram hash


- Feature extractors and descriptors (designed for object/scene retrieval). The following algorithms are supported in `hashing/featurehash.py`:
    - SIFT
    - ORB
    - FAST + LATCH
    - FAST + DAISY


- Neural networks (deep CNNs) whose last-layer features have been shown to provide rich descriptors of an image (regardless of the specific task the network was trained for, e.g. classification). The following architectures are supported in `hashing/neuralhash.py` (each network was pretrained on ImageNet, either for classification or by contrastive self-supervised learning):
    - Inception v3 (classification)
    - EfficientNet B7 (classification)
    - ResNets with different depth and width multipliers (classification)
    - SimCLR ResNets (contrastive learning). Link to [paper](https://arxiv.org/abs/2002.05709) and [github](https://github.com/google-research/simclr).

The specific goal here is to detect crude near-duplicate image manipulations rather than to perform object or scene retrieval.
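
To make the hashing family concrete, here is a minimal average-hash sketch in plain PIL/NumPy (for intuition only, independent of this library's API):

```python
import numpy as np
from PIL import Image

def average_hash(path, hash_size=8):
    # Reduce: grayscale, then shrink to hash_size x hash_size pixels
    img = Image.open(path).convert('L').resize((hash_size, hash_size))
    pixels = np.asarray(img, dtype=np.float64)
    # One bit per pixel: is it brighter than the global mean?
    return pixels > pixels.mean()

# Two near-duplicates should differ in only a few bits (Hamming distance):
# hamming = np.count_nonzero(average_hash('a.jpg') != average_hash('b.jpg'))
```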

# Usage

This library was created to benchmark all these methods. The easiest way is to choose a dataset, randomly split it into 2 parts (experimental and control groups), and sample a given number of images from both groups on which to perform the artificial attacks defined in `generator/generate_attacks.py`. The scripts `create_groups.py` and `create_attacks.py` perform those tasks and save the images with the correct name format for later matching.
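
A minimal sketch of such a random split (paths and seed are illustrative; the actual logic lives in `create_groups.py`):

```python
import os
import shutil
import numpy as np

rng = np.random.default_rng(42)  # fixed seed for a reproducible split

source_dir = 'Datasets/BSDS500/all'           # placeholder paths
experimental_dir = 'Datasets/BSDS500/Experimental'
control_dir = 'Datasets/BSDS500/Control'

files = rng.permutation(os.listdir(source_dir))
half = len(files) // 2

for name in files[:half]:
    shutil.move(os.path.join(source_dir, name), experimental_dir)
for name in files[half:]:
    shutil.move(os.path.join(source_dir, name), control_dir)
```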

Then, given a database of images to check for manipulations (the whole experimental group), an experimental group of images that are manipulations of some images in the database (all attacks on the images sampled from the experimental group), and a control group of images absent from the database (all attacks on the images sampled from the control group), the datasets can be declared as:

```python
import numpy as np  # used below to define the threshold grids

import hashing
from helpers import utils

path_database = 'Datasets/BSDS500/Experimental/'
path_experimental = 'Datasets/BSDS500/Experimental_attacks/'
path_control = 'Datasets/BSDS500/Control_attacks/'

positive_dataset = hashing.create_dataset(path_experimental, existing_attacks=True)
negative_dataset = hashing.create_dataset(path_control, existing_attacks=True)
```

Then declare the algorithms you wish to use, along with the thresholds for the matching logic (one array of candidate thresholds per algorithm), e.g.:

```python
algos = [
    hashing.ClassicalAlgorithm('Phash', hash_size=8),
    hashing.FeatureAlgorithm('ORB', n_features=30),
    hashing.NeuralAlgorithm('SimCLR v1 ResNet50 2x', device='cuda', distance='Jensen-Shannon')
]

# One array of candidate thresholds per algorithm, in the same order as algos
thresholds = [
    np.linspace(0, 0.4, 20),
    np.linspace(0, 0.3, 20),
    np.linspace(0.3, 0.8, 20),
]
```
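
For intuition about the `distance='Jensen-Shannon'` option above: a Jensen-Shannon distance between two normalized feature vectors can be computed with SciPy as below (a sketch only; how this library normalizes features before the comparison is not shown here):

```python
import numpy as np
from scipy.spatial.distance import jensenshannon

# Hypothetical feature vectors, normalized to probability distributions
p = np.random.rand(2048); p /= p.sum()
q = np.random.rand(2048); q /= q.sum()

# Square root of the Jensen-Shannon divergence; lies in [0, 1] with base 2
print(jensenshannon(p, q, base=2))
```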

Finally, perform the benchmark and save the results:

```python
save_folder = utils.parse_input()

digest = hashing.total_hashing(algos, thresholds, path_database, positive_dataset,
                               negative_dataset, general_batch_size=64)
utils.save_digest(digest, save_folder)
```

All this is contained in `main.py`.

The final digest is composed of 6 files: `general.json` with general metrics for the whole experiment, `attacks.json` with the metrics for each type of attack, `images_pos.json` and `images_neg.json` with the number of correct and incorrect detections for each image in the database, and `match_time.json` and `db_time.json` with the time (in seconds) of the matching phase and of the database creation phase, respectively.
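
The digest files are plain JSON, so they can be inspected directly; a sketch, assuming `general.json` maps each algorithm name to its metrics:

```python
import json
import os

save_folder = 'Results/my_benchmark'  # hypothetical output folder

with open(os.path.join(save_folder, 'general.json')) as f:
    general = json.load(f)

for algorithm, metrics in general.items():
    print(algorithm, metrics)
```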
4 changes: 2 additions & 2 deletions create_attacks.py
@@ -45,7 +45,7 @@
in names_experimental[0:N]]
save_control = [destination2 + name.split('.')[0] for name in names_control[0:N]]

-#generator.perform_all_and_save_list(images_experimental, save_name_list=save_experimental,
-#                                    extension='PNG', **params)
+generator.perform_all_and_save_list(images_experimental, save_name_list=save_experimental,
+                                    extension='PNG', **params)
generator.perform_all_and_save_list(images_control, save_name_list=save_control,
extension='PNG', **params)
33 changes: 0 additions & 33 deletions create_groups.py
@@ -33,36 +33,3 @@
for file_name in group2:
    shutil.move(os.path.join(source_dir, file_name), target_dir2)

-#%%
-import numpy as np
-import shutil
-import os
-
-rng = np.random.default_rng(31)
-
-memes_path = 'Datasets/Kaggle_memes/Memes'
-templates_path = 'Datasets/Kaggle_memes/Templates'
-target_dir = 'Datasets/Kaggle_memes/Experimental'
-
-memes = [file.split('_', 1)[0] for file in os.listdir(memes_path)]
-templates = [file.split('.', 1)[0] for file in os.listdir(templates_path)]
-
-tot = 0
-bank = set(templates)
-
-N = len(memes)//2
-
-while tot < N-200:
-    template = rng.choice(list(bank), size=1, replace=False)[0]
-    bank.remove(template)
-    tot += len([a for a in os.listdir(memes_path) if a.split('_', 1)[0] == template])
-
-for img in os.listdir(memes_path):
-
-    if img.split('_', 1)[0] == template:
-        shutil.move(os.path.join(memes_path, img), target_dir)
-
-#%%
-
-for file in os.listdir(memes_path):
-    shutil.move(os.path.join(memes_path, file), 'Datasets/Kaggle_memes/Control')
2 changes: 1 addition & 1 deletion hashing/neuralhash.py
@@ -609,7 +609,7 @@ class NeuralAlgorithm(Algorithm):
"""

-    def __init__(self, algorithm, hash_size=8, raw_features=False, distance='cosine',
+    def __init__(self, algorithm, hash_size=8, raw_features=True, distance='cosine',
batch_size=512, device='cuda'):

super().__init__(algorithm, hash_size, batch_size)
31 changes: 28 additions & 3 deletions process.py
@@ -192,7 +192,32 @@
#%%

from PIL import Image
import cv2
import numpy as np

# Load the original image and an attacked copy as grayscale arrays
original = 'Datasets/BSDS500/Control/data17.jpg'
original = Image.open(original)
original = np.array(original.convert('L'))

copy = 'Datasets/BSDS500/Control_attacks/data17_rotation_60_and_rescaling.png'
copy = Image.open(copy)
copy = np.array(copy.convert('L'))

# Detect SIFT keypoints and compute descriptors in both images
sift = cv2.SIFT_create()

kp1, des1 = sift.detectAndCompute(original, None)
kp2, des2 = sift.detectAndCompute(copy, None)

# Brute-force matching with cross-check, best matches first
bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
matches = bf.match(des1, des2)
matches = sorted(matches, key=lambda x: x.distance)

img3 = cv2.drawMatches(original, kp1, copy, kp2, matches[:30], None,
                       flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS)

out = Image.fromarray(img3)
# out.save('test_feat.png')

# Draw a random subset of 500 keypoints on the original image
img = cv2.drawKeypoints(original, np.random.choice(kp1, 500, replace=False), None,
                        flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
out_keypoints = Image.fromarray(img)
out_keypoints.save('keypoints.png')

image = 'Datasets/BSDS500/Control/data17.jpg'
image = Image.open(image)
image = image.convert("L").resize((16, 16), Image.ANTIALIAS)
Binary file removed test_correct.pdf
Binary file removed test_incorrect.pdf
