From f3db2aca5fd663a2c9257501a34be71a19d0036a Mon Sep 17 00:00:00 2001 From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no> Date: Wed, 20 Oct 2021 18:18:34 +0200 Subject: [PATCH 1/4] Module for deleting duplicate employee records --- cristin_ms/delete_duplicate_employees.py | 34 +++++++++++++++++++ tests/conftest.py | 5 +++ tests/fixtures/person_duplicate.json | 43 ++++++++++++++++++++++++ tests/test_delete_duplicate_employees.py | 32 ++++++++++++++++++ 4 files changed, 114 insertions(+) create mode 100644 cristin_ms/delete_duplicate_employees.py create mode 100644 tests/fixtures/person_duplicate.json create mode 100644 tests/test_delete_duplicate_employees.py diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py new file mode 100644 index 0000000..528d951 --- /dev/null +++ b/cristin_ms/delete_duplicate_employees.py @@ -0,0 +1,34 @@ +""" +Deletes duplicate records for the employees based on their person +number (fødselsnummer). To run: "python3 -m cristin_ms.delete_duplicate_employees" +""" +import logging +from typing import Optional + +from cristin_ms.cristin_export import get_context +#from cristin_ms.config import CristinMsConfig +from cristin_ms.context import CristinMsContext +from cristin_ms.database import Info + + +logger = logging.getLogger(__name__) + + +def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None): + ctx = cristin_ms_context or get_context() + + all_employees = ctx.info.get_all_by_type(Info.TYPE_EMPLOYEE) + all_pers_numbers = [] + for emp in all_employees: + pers_number = emp.data.get('fnr', None) + if not pers_number: + logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number') + ctx.info.delete(emp) + if not pers_number in all_pers_numbers: + all_pers_numbers.append(pers_number) + else: + logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}') + ctx.info.delete(emp) + +if __name__ == '__main__': + delete_duplicates() diff --git a/tests/conftest.py b/tests/conftest.py index 4384463..fed8d26 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -185,6 +185,11 @@ def person_00101219(): return load_json_file('person_00101219.json') +@pytest.fixture +def person_duplicate(): + return load_json_file('person_duplicate.json') + + @pytest.fixture def stilling_30000004(): return load_json_file('stilling_30000004.json') diff --git a/tests/fixtures/person_duplicate.json b/tests/fixtures/person_duplicate.json new file mode 100644 index 0000000..1ea1db2 --- /dev/null +++ b/tests/fixtures/person_duplicate.json @@ -0,0 +1,43 @@ +{ + "id": "00101235", + "brukerident": "olgaboe", + "dfoBrukerident": "9900olgaboe", + "fornavn": "Olga", + "etternavn": "Bøe", + "fnr": "26012199693", + "annenId": { + "idType": "02", + "idBeskrivelse": "Passnummer", + "idNr": "JKLFJLDSJAFØ", + "idStartdato": "1965-12-01", + "idSluttdato": "9999-12-31" + }, + "fdato": "1965-12-01", + "kjonn": "F", + "landkode": "NO", + "medarbeidergruppe": "4", + "medarbeiderundergruppe": "01", + "startdato": "2020-10-01", + "sluttdato": "9999-12-31", + "sluttarsak": null, + "stillingId": 30000004, + "stillingsprosent": "100.00", + "bevilgning": null, + "kostnadssted": null, + "organisasjonId": 10000018, + "jurBedriftsnummer": 912345678, + "lederflagg": false, + "portaltilgang": true, + "turnustilgang": false, + "eksternbruker": false, + "epost": null, + "privatTelefonnummer": null, + "telefonnummer": null, + "mobilnummer": null, + "privatPostadresse": "Trondheimsvei 66", + "privatPostnr": "7014", + "privatPoststed": "TRONDHEIM", + "endretDato": "2020-03-17", + "endretAv": "3-ANBE", + "tilleggsstilling": [] +} diff --git a/tests/test_delete_duplicate_employees.py b/tests/test_delete_duplicate_employees.py new file mode 100644 index 0000000..2d49d63 --- /dev/null +++ b/tests/test_delete_duplicate_employees.py @@ -0,0 +1,32 @@ +import pytest + +from dfo_sap_client.models import Ansatt +from sqlalchemy import func + +from cristin_ms.delete_duplicate_employees import delete_duplicates +from cristin_ms.database import Info + +def test_delete_duplicates(cristin_ms_context, + person_00101234, + person_duplicate): + service = cristin_ms_context.info + + data = Ansatt(**person_00101234).dict() + info = Info(type=Info.TYPE_EMPLOYEE, + data=data, + fetched_at=func.now()) + service.save(info) + + # A duplicate that should be deleted below + data2 = data=Ansatt(**person_duplicate).dict() + info2 = Info(type=Info.TYPE_EMPLOYEE, + data=data2, + fetched_at=func.now()) + service.save(info2) + + delete_duplicates(cristin_ms_context) + + existing_record = service.get_by_json_field_query('employee', + "fnr", + data['fnr']) + assert len(existing_record) == 1 -- GitLab From 0f9330adc73207f3de07cfda361b2db5f48ea994 Mon Sep 17 00:00:00 2001 From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no> Date: Wed, 20 Oct 2021 18:27:44 +0200 Subject: [PATCH 2/4] Fix style --- tests/test_delete_duplicate_employees.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_delete_duplicate_employees.py b/tests/test_delete_duplicate_employees.py index 2d49d63..6b318a8 100644 --- a/tests/test_delete_duplicate_employees.py +++ b/tests/test_delete_duplicate_employees.py @@ -7,8 +7,8 @@ from cristin_ms.delete_duplicate_employees import delete_duplicates from cristin_ms.database import Info def test_delete_duplicates(cristin_ms_context, - person_00101234, - person_duplicate): + person_00101234, + person_duplicate): service = cristin_ms_context.info data = Ansatt(**person_00101234).dict() @@ -27,6 +27,6 @@ def test_delete_duplicates(cristin_ms_context, delete_duplicates(cristin_ms_context) existing_record = service.get_by_json_field_query('employee', - "fnr", - data['fnr']) + "fnr", + data['fnr']) assert len(existing_record) == 1 -- GitLab From 66c1af3c002c20ba1b708a57a68cda58d6f143d3 Mon Sep 17 00:00:00 2001 From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no> Date: Thu, 21 Oct 2021 14:08:12 +0200 Subject: [PATCH 3/4] Minor fix --- cristin_ms/delete_duplicate_employees.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py index 528d951..8218fde 100644 --- a/cristin_ms/delete_duplicate_employees.py +++ b/cristin_ms/delete_duplicate_employees.py @@ -18,14 +18,14 @@ def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None): ctx = cristin_ms_context or get_context() all_employees = ctx.info.get_all_by_type(Info.TYPE_EMPLOYEE) - all_pers_numbers = [] + all_pers_numbers = set() for emp in all_employees: pers_number = emp.data.get('fnr', None) if not pers_number: logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number') ctx.info.delete(emp) if not pers_number in all_pers_numbers: - all_pers_numbers.append(pers_number) + all_pers_numbers.add(pers_number) else: logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}') ctx.info.delete(emp) -- GitLab From 7078cdc34e69e20f7a867a03a650d824359f3ac8 Mon Sep 17 00:00:00 2001 From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no> Date: Thu, 28 Oct 2021 10:36:10 +0200 Subject: [PATCH 4/4] Style fixes --- cristin_ms/delete_duplicate_employees.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py index 8218fde..818f9d4 100644 --- a/cristin_ms/delete_duplicate_employees.py +++ b/cristin_ms/delete_duplicate_employees.py @@ -6,7 +6,6 @@ import logging from typing import Optional from cristin_ms.cristin_export import get_context -#from cristin_ms.config import CristinMsConfig from cristin_ms.context import CristinMsContext from cristin_ms.database import Info @@ -24,11 +23,12 @@ def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None): if not pers_number: logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number') ctx.info.delete(emp) - if not pers_number in all_pers_numbers: + if pers_number not in all_pers_numbers: all_pers_numbers.add(pers_number) else: logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}') ctx.info.delete(emp) + if __name__ == '__main__': delete_duplicates() -- GitLab