Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Арипов Сергей АТ-13 #3

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions XmlReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import sys
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import ParseError


class ReadByChunk:
    """Stream a large XML file and yield each ``<tag ...>...</tag>`` element.

    The file is read in chunks of ``read_chars`` characters, so the whole
    document never has to fit in memory. Only elements whose opening tag is
    followed by a space (i.e. that carry attributes) are matched, and nested
    elements with the same tag name are not supported: the first closing tag
    found ends the element.

    The instance can be used as a context manager to make sure the underlying
    file is closed::

        with ReadByChunk("big.xml") as reader:
            for element in reader.tags():
                ...
    """

    def __init__(self, file, tag="cv", read_chars=1024 * 1024):
        """Open *file* for chunked reading.

        Args:
            file: Path of the UTF-8 encoded XML file.
            tag: Name of the elements to extract.
            read_chars: Number of characters requested per read.
        """
        self.read_chars = read_chars
        self.tag = tag
        self._buf = ""
        self.file = open(file, encoding="utf8")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.file.close()

    def _find_string(self, s, is_opening=True):
        """Return the index of *s* in the buffer, reading more data as needed.

        Args:
            s: Substring to look for.
            is_opening: True when searching for an opening tag. Everything
                before the match is garbage in that case, so the buffer is
                trimmed while scanning instead of growing without bound.

        Returns:
            Index of *s* inside ``self._buf``, or -1 if the end of the file
            is reached without a match.
        """
        pos = self._buf.find(s)
        # A match may straddle two chunks, so the last len(s) - 1 characters
        # of the old buffer must stay searchable after appending.
        overlap = len(s) - 1
        while pos == -1:
            chunk = self.file.read(self.read_chars)
            if not chunk:
                return -1
            if is_opening:
                # Nothing before the opening tag is needed: keep only the
                # tail that could be the start of a straddling match.
                self._buf = self._buf[-overlap:] if overlap else ""
                search_from = 0
            else:
                # The element body must be kept, but the part already
                # scanned does not need to be searched again.
                search_from = max(0, len(self._buf) - overlap)
            self._buf += chunk
            pos = self._buf.find(s, search_from)
        return pos

    def tags(self):
        """Yield every matching element as an ``xml.etree`` Element.

        Elements that are not well-formed XML are skipped. Iteration ends
        when no further complete element can be found in the file.
        """
        open_string = f'<{self.tag} '
        end_string = f'</{self.tag}>'
        while True:
            pos = self._find_string(open_string)
            if pos == -1:
                # A plain return ends the generator. Raising StopIteration
                # here would be converted into RuntimeError (PEP 479).
                return
            self._buf = self._buf[pos:]

            end_pos = self._find_string(end_string, False)
            if end_pos == -1:
                return

            cut = end_pos + len(end_string)
            xml_tag = self._buf[:cut]
            self._buf = self._buf[cut:]
            try:
                root = ET.fromstring(xml_tag)
            except ParseError:
                # Best effort: drop the malformed fragment and keep going.
                continue
            yield root
101 changes: 101 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import os
from collections import defaultdict

from nltk import PorterStemmer
from tqdm import tqdm
import pandas as pd
import nltk

import XmlReader
import csv

# Shared stemmer instance used by normalize() for every token.
# NOTE(review): the text is tokenized and POS-tagged as Russian below, while
# a Porter stemmer is used here - confirm whether a Russian stemmer (e.g.
# nltk's SnowballStemmer("russian")) was intended.
ps = PorterStemmer()


def normalize(s, not_allowed_parts=()):
    """Reduce free text to a sorted, comma-separated list of stemmed words.

    The text is split on '.', each non-blank piece is tokenized as Russian,
    every token is stripped and stemmed, and the stems are POS-tagged.
    Stems that are empty, whitespace-only, or tagged with a part of speech
    listed in *not_allowed_parts* are dropped.

    Args:
        s: Text to normalize.
        not_allowed_parts: POS tags whose words are removed from the result.

    Returns:
        The remaining stems, sorted and joined with ', '.
    """
    stems = []
    for sentence in s.split('.'):
        if sentence.isspace():
            continue
        for token in nltk.word_tokenize(sentence, language='russian'):
            stems.append(ps.stem(token.strip()))

    kept = []
    for word, part_of_speech in nltk.pos_tag(stems, lang='rus'):
        if word == '' or word.isspace():
            continue
        if part_of_speech in not_allowed_parts:
            continue
        kept.append(word)

    kept.sort()
    return ', '.join(kept)


def main():
    """Run the job-title versus qualification analysis on works.csv.

    Builds works.csv first if it does not exist, downloads the nltk
    resources used for tokenizing and tagging, normalizes the jobTitle and
    qualification columns, and prints:

    * how many rows have a title and a qualification where neither
      normalized string contains the other;
    * the qualifications of rows whose title mentions a manager;
    * the five most frequent qualifications for managers and for engineers.
    """
    if not os.path.isfile('works.csv'):
        prepare_csv()
    for resource in ('punkt', 'averaged_perceptron_tagger_ru'):
        nltk.download(resource)
    data = pd.read_csv('works.csv').dropna()

    not_allowed_parts = ['ADP', 'CONJ', 'DET', 'PRT', 'PRON', '.', 'X']

    def clean(text):
        return normalize(text, not_allowed_parts)

    data['jobTitle'] = data['jobTitle'].map(clean, na_action='ignore')
    data['qualification'] = data['qualification'].map(clean, na_action='ignore')

    def is_blank(text):
        return text == '' or text.isspace()

    # A row counts as a mismatch when both fields are present and neither
    # normalized string is a substring of the other.
    not_matching = sum(
        1
        for title, qualif in zip(data['jobTitle'], data['qualification'])
        if not (is_blank(title) or is_blank(qualif))
        and title not in qualif
        and qualif not in title
    )

    print(f'Не совпадающие профессии: {not_matching}')  # recorded result: 989

    manager_qualifications = data[data['jobTitle'].str.contains('менеджер')]['qualification']
    engineer_qualifications = data[data['jobTitle'].str.contains('инженер')]['qualification']

    print(manager_qualifications)
    top_managers = manager_qualifications.value_counts().head(5)
    top_engineers = engineer_qualifications.value_counts().head(5)

    print('Топ 5 менеджеров:')
    print(top_managers)
    # Recorded output (translated):
    # bachelor             11
    # manager              10
    # specialist            6
    # economist             6
    # economist-manager     4
    print('Топ 5 инженеров:')
    print(top_engineers)
    # Recorded output (translated):
    # engineer              18
    # bachelor               4
    # mechanical engineer    3
    # electrical engineer    2
    # manager                2

def prepare_csv():
    """Sample records from big_xml.xml into works.csv.

    Every element yielded by ``XmlReader.ReadByChunk`` is considered, and
    about one in 101 is kept at random. For a kept element the sub-elements
    listed in ``rename`` are looked up and written under their CSV column
    name; a missing sub-element leaves the column empty.

    The CSV is written as UTF-8 so that it can be read back by
    ``pd.read_csv`` (which defaults to UTF-8) regardless of the platform's
    default encoding.
    """
    from random import randint
    # XML path inside one record -> CSV column name.
    rename = {
        'salary': 'salary',
        'educationType': 'educationType',
        'workExperienceList/workExperience[1]/jobTitle': 'jobTitle',
        "educationList/educationType[1]/qualification": 'qualification',
        'gender': 'gender',
        'innerInfo/dateModify': 'dateModify',
        "skills": "skills",
        "otherInfo": "otherInfo"
    }
    reader = XmlReader.ReadByChunk("big_xml.xml")
    try:
        with open('works.csv', 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = list(rename.values())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for tag in tqdm(reader.tags()):
                # Decide first whether the record is sampled, so that the
                # field lookups are only done for rows that get written.
                if randint(0, 100) != 0:
                    continue

                props = {}
                for k, v in rename.items():
                    actual_tag = tag.find(k)
                    if actual_tag is not None:
                        props[v] = actual_tag.text
                # DictWriter fills columns missing from props with ''.
                writer.writerow(props)
    finally:
        # Release the XML file handle even if reading or writing fails.
        reader.file.close()


# Run the analysis only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading