diff --git a/src/4.Classification_RF_species-invariant.ipynb b/src/4.Classification_RF_species-invariant.ipynb index 8ff8ca2..ce0fc39 100644 --- a/src/4.Classification_RF_species-invariant.ipynb +++ b/src/4.Classification_RF_species-invariant.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "from data import DATA_MC3D, DATA_MP\n", + "from data import DATA_3DCD, DATA_MP\n", "import pickle\n", "from IPython.display import clear_output\n", "from sklearn.model_selection import learning_curve" ] }, { @@ -42,12 +42,11 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "75a4626d", "metadata": {}, "source": [ - "# MC3D data analysis" + "# 3DCD data analysis" ] }, { @@ -57,10 +56,10 @@ "metadata": {}, "outputs": [], "source": [ - "npzfile_MC3D = np.load(DATA_MC3D.soap, allow_pickle=True)\n", - "my_frames_MC3D = ase.io.read(DATA_MC3D.structures, index=\":\")\n", - "magic_MC3D = np.array(npzfile_MC3D[\"magic\"], dtype=int).reshape(-1, 1)\n", - "y_magic_MC3D = np.copy(magic_MC3D.reshape(-1, 1).ravel())" + "npzfile_3dcd = np.load(DATA_3DCD.soap, allow_pickle=True)\n", + "my_frames_3dcd = ase.io.read(DATA_3DCD.structures, index=\":\")\n", + "magic_3dcd = np.array(npzfile_3dcd[\"magic\"], dtype=int).reshape(-1,1)\n", + "y_magic_3dcd = np.copy(magic_3dcd.reshape(-1, 1).ravel())" ] }, { @@ -72,7 +71,7 @@ "source": [ "npzfile_mp = np.load(DATA_MP.soap, allow_pickle=True)\n", "my_frames_mp = ase.io.read(DATA_MP.structures, index=\":\")\n", - "magic_mp = np.array(npzfile_mp[\"magic\"], dtype=int).reshape(-1, 1)\n", + "magic_mp = np.array(npzfile_mp[\"magic\"], dtype=int).reshape(-1,1)\n", "y_magic_mp = np.copy(magic_mp.reshape(-1, 1).ravel())" ] }, { @@ -83,9 +82,9 @@ "metadata": {}, "outputs": [], "source": [ - "my_frames = np.array([*my_frames_MC3D, *my_frames_mp])\n", - "my_orig_frames = np.array([*my_frames_MC3D, *my_frames_mp])\n", - "del my_frames_mp, my_frames_MC3D" + "my_frames = np.array([*my_frames_3dcd, *my_frames_mp], dtype=object)\n", + "my_orig_frames 
= np.array([*my_frames_3dcd, *my_frames_mp], dtype=object)\n", + "del my_frames_mp, my_frames_3dcd" ] }, { @@ -95,8 +94,8 @@ "metadata": {}, "outputs": [], "source": [ - "y_magic = np.hstack((y_magic_MC3D, y_magic_mp))\n", - "del magic_mp, magic_MC3D, y_magic_MC3D, y_magic_mp" + "y_magic = np.hstack((y_magic_3dcd, y_magic_mp))\n", + "del magic_mp, magic_3dcd, y_magic_3dcd, y_magic_mp" ] }, { @@ -106,10 +105,11 @@ "metadata": {}, "outputs": [], "source": [ + "from tqdm.auto import tqdm\n", "for frame in tqdm(my_frames):\n", " frame.numbers = np.zeros((len(frame)))\n", " frame.pbc = True\n", - " frame.wrap(eps=1e-10)" + " frame.wrap(eps=1E-10)" ] }, { @@ -139,16 +139,16 @@ "metadata": {}, "outputs": [], "source": [ - "if os.path.exists(\"train_indices_all.npy\"):\n", - " i_train, i_test = np.load(\"train_indices_all.npy\"), np.load(\"test_indices_all.npy\")\n", + "if os.path.exists('train_indices_all.npy'):\n", + " i_train, i_test = np.load('train_indices_all.npy'), np.load('test_indices_all.npy')\n", " y_train, y_test = y_magic[i_train], y_magic[i_test]\n", "else:\n", - " print(\"generating\")\n", + " print('generating')\n", " i_train, i_test, y_train, y_test = train_test_split(\n", " np.arange(X.shape[0]), y_magic, train_size=0.9\n", " )\n", - " np.save(\"train_indices_all.npy\", i_train)\n", - " np.save(\"test_indices_all.npy\", i_test)" + " np.save('train_indices_all.npy', i_train)\n", + " np.save('test_indices_all.npy', i_test)" ] }, { @@ -166,13 +166,11 @@ "TP_test_all = np.zeros(len(rads), dtype=int)\n", "\n", "for i, r in enumerate(rads):\n", - " if not os.path.exists(\"soaps_{}.npy\".format(r)):\n", - " representation = SOAP(interaction_cutoff=r, **hypers)\n", - " n_soap = (\n", - " representation.transform(my_frames[0]).get_features(representation).shape[1]\n", - " )\n", + " if not os.path.exists('soaps_{}.npy'.format(r)):\n", + " representation=SOAP(interaction_cutoff=r, **hypers)\n", + " n_soap = 
representation.transform(my_frames[0]).get_features(representation).shape[1]\n", "\n", - " fn = \"soaps_{}.npy\".format(r)\n", + " fn = 'soaps_{}.npy'.format(r)\n", " blocks = np.array_split(np.arange(len(my_frames)), 500)\n", " if os.path.exists(fn):\n", " X_raw = np.load(fn)\n", @@ -182,35 +180,27 @@ " frames = my_frames[block]\n", " for frame in frames:\n", " frame.pbc = True\n", - " frame.wrap(eps=1e-10)\n", + " frame.wrap(eps=1E-10)\n", " centers = [len(f) for f in frames]\n", " csi_split = [0, *np.cumsum(centers)]\n", - " idxi = [\n", - " range(csi_split[ii], csi_split[ii + 1]) for ii in range(len(block))\n", - " ]\n", + " idxi = [range(csi_split[ii], csi_split[ii + 1]) for ii in range(len(block))]\n", "\n", " try:\n", - " feats = representation.transform(frames).get_features(\n", - " representation\n", - " )\n", + " feats = representation.transform(frames).get_features(representation)\n", " for ii, idx in zip(block, idxi):\n", " X_raw[ii] = feats[idx].mean(axis=0)\n", " except RuntimeError:\n", " for ii, f in zip(block, frames):\n", " try:\n", - " X_raw[ii] = (\n", - " representation.transform(f)\n", - " .get_features(representation)\n", - " .mean(axis=0)\n", - " )\n", + " X_raw[ii] = representation.transform(f).get_features(representation).mean(axis=0)\n", " except RuntimeError:\n", " print(\"Problem with\", ii)\n", - "\n", + " \n", " np.save(fn, X_raw)\n", - " X = np.load(\"soaps_{}.npy\".format(r))\n", + " X = np.load('soaps_{}.npy'.format(r))\n", " X_train, X_test = X[i_train], X[i_test]\n", " if os.path.exists(\"x_scaler_blanked_{}.sav\".format(r)):\n", - " x_scaler = pickle.load(open(\"x_scaler_blanked_{}.sav\".format(r), \"rb\"))\n", + " x_scaler = pickle.load(open(\"x_scaler_blanked_{}.sav\".format(r), 'rb'))\n", " else:\n", " x_scaler = StandardFlexibleScaler(column_wise=False).fit(X)\n", " pickle.dump(x_scaler, open(\"x_scaler_blanked_{}.sav\".format(r), \"wb\"))\n", @@ -218,31 +208,35 @@ " X = x_scaler.transform(X)\n", " X_train = 
x_scaler.transform(X_train)\n", " X_test = x_scaler.transform(X_test)\n", - "\n", - " if os.path.exists(\"random_forest_all_blanked_{}.sav\".format(r)):\n", - " clf = pickle.load(open(\"random_forest_all_blanked_{}.sav\".format(r), \"rb\"))\n", + " \n", + " if os.path.exists('random_forest_all_blanked_{}.sav'.format(r)):\n", + " clf = pickle.load(open('random_forest_all_blanked_{}.sav'.format(r), 'rb'))\n", " else:\n", - " clf = RandomForestClassifier(\n", - " verbose=2, n_estimators=100, random_state=2, n_jobs=4\n", - " )\n", + " clf = RandomForestClassifier(verbose=2, n_estimators=100, random_state=2, n_jobs=4)\n", " clf.fit(X_train, y_train.ravel())\n", - " pickle.dump(clf, open(\"random_forest_all_blanked_{}.sav\".format(r), \"wb\"))\n", + " pickle.dump(clf, open('random_forest_all_blanked_{}.sav'.format(r), 'wb'))\n", " errors[i] = clf.score(X_test, y_test.ravel())\n", " prob = clf.predict_proba(X)\n", - " TP_test_all[i] = len(\n", - " np.where(np.logical_and(y_test == 1, prob[i_test][:, 1] > 0.5))[0]\n", - " )\n", - " TN_test_all[i] = len(\n", - " np.where(np.logical_and(y_test == 0, prob[i_test][:, 0] > 0.5))[0]\n", - " )\n", - " FP_test_all[i] = len(\n", - " np.where(np.logical_and(y_test == 0, prob[i_test][:, 1] > 0.5))[0]\n", - " )\n", - " FN_test_all[i] = len(\n", - " np.where(np.logical_and(y_test == 1, prob[i_test][:, 0] > 0.5))[0]\n", - " )\n", + " TP_test_all[i] = len(np.where(np.logical_and(y_test==1, prob[i_test][:, 1] > 0.5))[0])\n", + " TN_test_all[i] = len(np.where(np.logical_and(y_test==0, prob[i_test][:, 0] > 0.5))[0])\n", + " FP_test_all[i] = len(np.where(np.logical_and(y_test==0, prob[i_test][:, 1] > 0.5))[0])\n", + " FN_test_all[i] = len(np.where(np.logical_and(y_test==1, prob[i_test][:, 0] > 0.5))[0])\n", " clear_output()\n", - "plt.plot(rads, errors, marker=\"o\")" + "plt.plot(rads, errors, marker='o')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc97a067-4b74-4f35-9b21-f9ce0509843a", + "metadata": {}, + 
"outputs": [], "source": [ + "precision = TP_test_all / (TP_test_all + FP_test_all)\n", + "recall = TP_test_all / (TP_test_all + FN_test_all)\n", + "plt.plot(rads, errors, marker='o', c='b')\n", + "plt.plot(rads, precision, marker='o', c='g')\n", + "plt.plot(rads, recall, marker='o', c='orange')" ] }, { @@ -255,18 +249,18 @@ "from sklearn.decomposition import PCA\n", "\n", "r = 4.0\n", - "X_raw = np.load(\"soaps_{}.npy\".format(r))\n", + "X_raw = np.load('soaps_{}.npy'.format(r))\n", "X_train, X_test = X[i_train], X_raw[i_test]\n", - "x_scaler = pickle.load(open(\"x_scaler_blanked_{}.sav\".format(r), \"rb\"))\n", + "x_scaler = pickle.load(open(\"x_scaler_blanked_{}.sav\".format(r), 'rb'))\n", "X = x_scaler.transform(X_raw)\n", "X_train = x_scaler.transform(X_train)\n", "X_test = x_scaler.transform(X_test)\n", - "clf = pickle.load(open(\"random_forest_all_blanked_{}.sav\".format(r), \"rb\"))\n", + "clf = pickle.load(open('random_forest_all_blanked_{}.sav'.format(r), 'rb'))\n", "\n", - "TP_test = np.where(np.logical_and(y_test == 1, prob[i_test][:, 1] > 0.5))[0]\n", - "TN_test = np.where(np.logical_and(y_test == 0, prob[i_test][:, 0] > 0.5))[0]\n", - "FP_test = np.where(np.logical_and(y_test == 0, prob[i_test][:, 1] > 0.5))[0]\n", - "FN_test = np.where(np.logical_and(y_test == 1, prob[i_test][:, 0] > 0.5))[0]" + "TP_test = np.where(np.logical_and(y_test==1, prob[i_test][:, 1] > 0.5))[0]\n", + "TN_test = np.where(np.logical_and(y_test==0, prob[i_test][:, 0] > 0.5))[0]\n", + "FP_test = np.where(np.logical_and(y_test==0, prob[i_test][:, 1] > 0.5))[0]\n", + "FN_test = np.where(np.logical_and(y_test==1, prob[i_test][:, 0] > 0.5))[0]" ] }, { @@ -286,38 +280,28 @@ "metadata": {}, "outputs": [], "source": [ - "if not os.path.exists(\"lc_{}.npz\".format(r)):\n", - " temp_rf = RandomForestClassifier(\n", - " verbose=False, n_estimators=100, random_state=2, n_jobs=4\n", - " )\n", - " lc = learning_curve(\n", - " temp_rf,\n", - " X,\n", - " y_magic,\n", - " verbose=2,\n", - " 
train_sizes=np.array(\n", - " X.shape[0] * np.logspace(-3, np.log10(0.8), 30), dtype=int\n", - " ),\n", - " cv=5,\n", - " )\n", - " np.savez(\"lc_{}.npz\".format(r), n=lc[0], train=lc[1], test=lc[2])\n", + "if not os.path.exists('lc_{}.npz'.format(r)):\n", + " temp_rf = RandomForestClassifier(verbose=False, n_estimators=100, random_state=2, n_jobs=4)\n", + " lc = learning_curve(temp_rf, X, y_magic, verbose=2, \n", + " train_sizes=np.array(X.shape[0] * np.logspace(-3, np.log10(0.8), 30), dtype=int), \n", + " cv=5)\n", + " np.savez('lc_{}.npz'.format(r), n=lc[0], train=lc[1], test=lc[2])\n", "\n", - "lc = np.load(\"lc_{}.npz\".format(r))\n", - "n = lc[\"n\"]\n", - "train_errors = lc[\"train\"]\n", - "test_errors = lc[\"test\"]\n", + "lc = np.load('lc_{}.npz'.format(r))\n", + "n = lc['n']\n", + "train_errors = lc['train']\n", + "test_errors = lc['test']\n", "\n", "error = 1.0 - test_errors\n", - "plt.loglog(n, np.mean(error, axis=1), marker=\"o\", color=\"k\")\n", + "plt.loglog(n, np.mean(error, axis=1), marker='o', color='k')\n", "\n", - "for a in [1, 2]:\n", - " plt.fill_between(\n", - " n,\n", - " np.mean(error, axis=1) - a * np.std(error, axis=1),\n", - " np.mean(error, axis=1) + a * np.std(error, axis=1),\n", - " alpha=0.2,\n", - " color=\"r\",\n", - " )" + "for a in [1,2]:\n", + " plt.fill_between(n, \n", + " np.mean(error, axis=1) - a*np.std(error, axis=1),\n", + " np.mean(error, axis=1) + a*np.std(error, axis=1),\n", + " alpha=0.2,\n", + " color='r'\n", + " )" ] }, { @@ -338,44 +322,32 @@ "outputs": [], "source": [ "from mpl_toolkits.axes_grid1.inset_locator import inset_axes\n", - "\n", - "fig, axes = plt.subplots(2, 1, figsize=(6, 6), gridspec_kw=dict(height_ratios=(2, 1)))\n", + "fig, axes = plt.subplots(2, 1, figsize=(6, 6), gridspec_kw=dict(height_ratios=(2,1)))\n", "\n", "ax_rad = axes[0]\n", "ax_tab = axes[1]\n", - "ax_lc = inset_axes(ax_rad, width=\"40%\", height=\"40%\", loc=\"center right\", borderpad=3)\n", + "ax_lc = inset_axes(ax_rad, width='40%', 
height='40%', loc='center right', borderpad=3)\n", "\n", - "ax_rad.plot(rads, errors, marker=\"o\")\n", + "ax_rad.plot(rads, errors, marker='o')\n", "\n", "sub_rads = [1, 2, 3, 4, 5, 8]\n", "sub_i = np.array([i for i, r in enumerate(rads) if r in sub_rads])\n", - "tab = ax_tab.table(\n", - " [\n", - " np.array(rads)[sub_i],\n", - " TP_test_all[sub_i],\n", - " TN_test_all[sub_i],\n", - " FP_test_all[sub_i],\n", - " FN_test_all[sub_i],\n", - " ],\n", - " rowLabels=[r\"Radius [$\\AA$]\", \"TP\", \"TN\", \"FP\", \"FN\"],\n", - " cellLoc=\"center\",\n", - " rowLoc=\"center\",\n", - " bbox=(0.0, 0, 1, 1),\n", - " loc=\"center right\",\n", - ")\n", + "tab = ax_tab.table([np.array(rads)[sub_i], TP_test_all[sub_i], TN_test_all[sub_i], FP_test_all[sub_i], FN_test_all[sub_i]], \n", + " rowLabels=[r'Radius [$\\AA$]','TP', 'TN','FP','FN'], \n", + " cellLoc='center',\n", + " rowLoc='center',\n", + " bbox=(0.0,0,1,1), loc='center right')\n", "ax_rad.set_xlabel(\"Radius [$\\AA$]\")\n", - "ax_rad.set_ylabel(\"R$^2$\")\n", + "ax_rad.set_ylabel(\"Accuracy\")\n", "ax_rad.set_title(\"Classification Accuracy with Increasing $r_\\mathrm{cut}$\")\n", - "ax_tab.axis(\"off\")\n", + "ax_tab.axis('off')\n", "\n", - "error = 1.0 - test_errors\n", - "ax_lc.errorbar(\n", - " n, np.mean(error, axis=1), np.std(error, axis=1), marker=\".\", capsize=2, color=\"k\"\n", - ")\n", - "ax_lc.set_xscale(\"log\")\n", + "error = test_errors\n", + "ax_lc.errorbar(n, np.mean(error, axis=1), np.std(error, axis=1), marker='.', capsize=2, color='k')\n", + "ax_lc.set_xscale('log')\n", "\n", - "ax_lc.set_ylabel(r\"1-R$^2$\")\n", - "ax_lc.set_xlabel(r\"$n_\\mathrm{train}$\")\n", + "ax_lc.set_ylabel(r'Accuracy')\n", + "ax_lc.set_xlabel(r'$n_\\mathrm{train}$')\n", "ax_lc.set_title(\"Learning Curve for $r_\\mathrm{cut} = 4.0\\AA$\")\n", "fig.subplots_adjust(hspace=0.5)\n", "\n", @@ -389,12 +361,50 @@ "id": "01a5ba0f", "metadata": {}, "outputs": [], + "source": [ + "from mpl_toolkits.axes_grid1.inset_locator import 
inset_axes\n", + "fig, ax_rad = plt.subplots(1, 1, figsize=(6, 4),)\n", + "\n", + "ax_lc = inset_axes(ax_rad, width='30%', height='30%', loc='lower right', borderpad=4)\n", + "\n", + "ax_rad.plot(rads, errors, marker='o', label='Accuracy')\n", + "ax_rad.plot(rads, precision, marker='o', color='g', label='Precision')\n", + "ax_rad.plot(rads, recall, marker='o', color='purple', label='Recall')\n", + "\n", + "sub_rads = [1, 2, 3, 4, 5, 8]\n", + "sub_i = np.array([i for i, r in enumerate(rads) if r in sub_rads])\n", + "\n", + "ax_rad.set_xlabel(\"Radius [$\\AA$]\")\n", + "ax_rad.set_ylabel(\"Accuracy\")\n", + "ax_rad.set_title(\"Classification Accuracy with Increasing $r_\\mathrm{cut}$\")\n", + "ax_tab.axis('off')\n", + "\n", + "error = test_errors\n", + "ax_lc.errorbar(n, np.mean(error, axis=1), np.std(error, axis=1), marker='.', capsize=2, color='k')\n", + "ax_lc.set_xscale('log')\n", + "\n", + "ax_lc.set_ylabel(r'Accuracy')\n", + "ax_lc.set_xlabel(r'$n_\\mathrm{train}$')\n", + "ax_lc.set_title(\"Learning Curve for $r_\\mathrm{cut} = 4.0\\AA$\")\n", + "fig.subplots_adjust(hspace=0.5)\n", + "\n", + "fig.tight_layout()\n", + "ax_rad.legend(loc='center left')\n", + "plt.savefig(\"random_forest_new.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b349e851-e781-4200-85e9-2b932f6b4e13", + "metadata": {}, + "outputs": [], "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -408,7 +418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.13" } }, "nbformat": 4,