bibilov · DantesFonCake · Dec 24, 2021
diff --git a/XmlReader.py b/XmlReader.py
@@ -0,0 +1,50 @@
+import sys
+import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import ParseError
+
+
+class ReadByChunk:
+    """Stream ``<tag ...>...</tag>`` elements out of a large XML file.
+
+    The file is read ``read_chars`` characters at a time and scanned as plain
+    text, so the document does not have to be loaded or parsed as a whole.
+
+    Limitations of the text scan:
+      * An element is recognised by the literal ``<tag `` (tag name followed
+        by a space), so an element written without attributes, e.g.
+        ``<cv>``, is not detected.
+      * An element ends at the first ``</tag>`` after its opening marker, so
+        nested elements with the same tag name are cut short.
+
+    The underlying file is closed automatically once ``tags()`` reaches the
+    end of the input. Call ``close()`` or use the instance as a context
+    manager when iteration may stop early.
+    """
+
+    def __init__(self, file, tag="cv", read_chars=1024 * 1024):
+        """Open *file* (UTF-8 text) for chunked scanning.
+
+        Args:
+            file: Path of the XML file to read.
+            tag: Name of the element to extract.
+            read_chars: Number of characters requested per read.
+        """
+        self.read_chars = read_chars
+        self.tag = tag
+        self._buf = ""
+        self.file = open(file, encoding="utf8")
+
+    def close(self):
+        """Close the underlying file; safe to call more than once."""
+        if not self.file.closed:
+            self.file.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def _find_string(self, s, is_opening=True):
+        """Return the index of *s* in the buffer, reading more data as needed.
+
+        Args:
+            s: Literal text to look for.
+            is_opening: True when looking for the start of an element. Text
+                that cannot be part of a match is then dropped from the
+                buffer, so the returned index refers to the trimmed buffer.
+
+        Returns:
+            Index of the first occurrence of *s* in ``self._buf``, or -1 when
+            the input is exhausted without a match.
+        """
+        pos = self._buf.find(s)
+        # A match that straddles two chunks can reuse at most len(s) - 1
+        # characters from the end of the data already buffered.
+        overlap = len(s) - 1
+        while pos == -1:
+            if self.file.closed:
+                return -1
+            additional_buf = self.file.read(self.read_chars)
+            if additional_buf == '':
+                return -1
+            if is_opening:
+                # Nothing before the overlap can start a match: discard it so
+                # long stretches without the tag do not pile up in memory.
+                self._buf = self._buf[-overlap:] if overlap else ""
+                search_from = 0
+            else:
+                # Keep the whole element, but skip the part already searched.
+                search_from = max(0, len(self._buf) - overlap)
+            self._buf += additional_buf
+            pos = self._buf.find(s, search_from)
+        return pos
+
+    def tags(self):
+        """Yield each matching element as a parsed ``Element``.
+
+        Elements whose text is not well-formed XML are skipped. Iteration
+        ends, and the file is closed, when no further complete element can
+        be found.
+        """
+        open_string = f'<{self.tag} '
+        end_string = f'</{self.tag}>'
+        while True:
+            pos = self._find_string(open_string)
+            if pos == -1:
+                break
+            # Drop everything in front of the element's opening marker.
+            self._buf = self._buf[pos:]
+
+            end_pos = self._find_string(end_string, False)
+            if end_pos == -1:
+                break
+
+            cut = end_pos + len(end_string)
+            xml_tag, self._buf = self._buf[:cut], self._buf[cut:]
+            try:
+                root = ET.fromstring(xml_tag)
+            except ParseError:
+                # Best effort: a malformed record must not abort the stream.
+                continue
+            yield root
+        # A plain fall-through ends the generator; raising StopIteration here
+        # would surface as RuntimeError (PEP 479).
+        self.close()
diff --git a/main.py b/main.py
@@ -0,0 +1,101 @@
+import os
+from collections import defaultdict
+
+from nltk import PorterStemmer
+from tqdm import tqdm
+import pandas as pd
+import nltk
+
+import XmlReader
+import csv
+
+# Module-level stemmer shared by normalize() so it is constructed only once.
+# NOTE(review): the Porter algorithm was designed for English, while normalize()
+# tokenizes and tags its input as Russian — confirm this stemmer is intended.
+ps = PorterStemmer()
+
+
+def normalize(s, not_allowed_parts=()):
+    """Reduce free text to a sorted, comma-separated list of word stems.
+
+    The text is split on '.', each non-whitespace piece is tokenized as
+    Russian, and every token is stripped and stemmed with the module-level
+    stemmer. The stems are then POS-tagged; empty or whitespace-only stems
+    and stems whose tag is listed in *not_allowed_parts* are dropped.
+
+    Args:
+        s: Text to normalize.
+        not_allowed_parts: POS tags whose words are excluded from the result.
+
+    Returns:
+        The remaining stems, sorted and joined with ', '.
+    """
+    stems = []
+    for sentence in s.split('.'):
+        if str.isspace(sentence):
+            continue
+        for token in nltk.word_tokenize(sentence, language='russian'):
+            stems.append(ps.stem(str.strip(token)))
+
+    kept = [
+        word
+        for word, part in nltk.pos_tag(stems, lang='rus')
+        if word != '' and not str.isspace(word) and part not in not_allowed_parts
+    ]
+    kept.sort()
+    return ', '.join(kept)
+
+
+def main():
+    """Compare job titles with qualifications from ``works.csv`` and print stats.
+
+    Creates ``works.csv`` via prepare_csv() when it does not exist yet,
+    downloads the NLTK resources used by normalize(), normalizes the
+    ``jobTitle`` and ``qualification`` columns, and then prints:
+      * how many rows have a title and qualification where neither string
+        contains the other;
+      * the qualifications of rows whose title contains 'менеджер';
+      * the five most frequent qualifications for titles containing
+        'менеджер' and for titles containing 'инженер'.
+    """
+    if not os.path.isfile('works.csv'):
+        prepare_csv()
+    for resource in ('punkt', 'averaged_perceptron_tagger_ru'):
+        nltk.download(resource)
+    data = pd.read_csv('works.csv').dropna()
+
+    not_allowed_parts = ['ADP', 'CONJ', 'DET', 'PRT', 'PRON', '.', 'X']
+
+    def clean(text):
+        return normalize(text, not_allowed_parts)
+
+    data.jobTitle = data.jobTitle.map(clean, na_action='ignore')
+    data.qualification = data.qualification.map(clean, na_action='ignore')
+
+    def is_blank(text):
+        return text == '' or str.isspace(text)
+
+    # Count rows where both fields are non-blank and neither contains the other.
+    not_matching = sum(
+        1
+        for title, qualif in zip(data.jobTitle, data.qualification)
+        if not (is_blank(title) or is_blank(qualif))
+        and title not in qualif
+        and qualif not in title
+    )
+
+    print(f'Не совпадающие профессии: {not_matching}')  # 989 in the author's recorded run
+
+    manager_quals = data[data.jobTitle.str.contains('менеджер')].qualification
+    print(manager_quals)
+    top_managers = manager_quals.value_counts().head(5)
+
+    engineer_quals = data[data.jobTitle.str.contains('инженер')].qualification
+    top_engineers = engineer_quals.value_counts().head(5)
+
+    print('Топ 5 менеджеров:')
+    print(top_managers)
+    # Output recorded by the author from one run (qualification, count):
+    #   бакалавр (bachelor)                       11
+    #   менеджер (manager)                        10
+    #   специалист (specialist)                    6
+    #   экономист (economist)                      6
+    #   экономист-менеджер (economist-manager)     4
+    print('Топ 5 инженеров:')
+    print(top_engineers)
+    # Output recorded by the author from one run (qualification, count):
+    #   инженер (engineer)                        18
+    #   бакалавр (bachelor)                        4
+    #   инженер-механик (mechanical engineer)      3
+    #   инженер-электрик (electrical engineer)     2
+    #   менеджер (manager)                         2
+
+def prepare_csv():
+    """Write a random sample of the records in ``big_xml.xml`` to ``works.csv``.
+
+    Each record yielded by the XML reader is kept when ``randint(0, 100)``
+    returns 0, i.e. roughly 1 record in 101. For a kept record, the element
+    paths listed in ``rename`` are looked up and written under the mapped
+    column names; a path that is absent leaves its column empty.
+
+    The CSV is written as UTF-8 so that it can be read back with the pandas
+    default encoding regardless of the platform locale.
+    """
+    from random import randint
+    rename = {
+        'salary': 'salary',
+        'educationType': 'educationType',
+        'workExperienceList/workExperience[1]/jobTitle': 'jobTitle',
+        "educationList/educationType[1]/qualification": 'qualification',
+        'gender': 'gender',
+        'innerInfo/dateModify': 'dateModify',
+        "skills": "skills",
+        "otherInfo": "otherInfo",
+    }
+    reader = XmlReader.ReadByChunk("big_xml.xml")
+    try:
+        # Pin the encoding: the platform default may be a legacy code page
+        # that cannot be decoded as UTF-8 when the file is read back.
+        with open('works.csv', 'w', newline='', encoding='utf8') as csvfile:
+            writer = csv.DictWriter(csvfile, fieldnames=list(rename.values()))
+            writer.writeheader()
+
+            for tag in tqdm(reader.tags()):
+                # Decide first, so skipped records cost no element lookups.
+                if randint(0, 100) != 0:
+                    continue
+
+                row = {}
+                for path, column in rename.items():
+                    element = tag.find(path)
+                    if element is not None:
+                        row[column] = element.text
+                writer.writerow(row)
+    finally:
+        # Release the input file even when reading or writing fails.
+        reader.file.close()
+
+
+# Run the analysis only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()