From f3db2aca5fd663a2c9257501a34be71a19d0036a Mon Sep 17 00:00:00 2001
From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no>
Date: Wed, 20 Oct 2021 18:18:34 +0200
Subject: [PATCH 1/4] Module for deleting duplicate employee records

---
 cristin_ms/delete_duplicate_employees.py | 34 +++++++++++++++++++
 tests/conftest.py                        |  5 +++
 tests/fixtures/person_duplicate.json     | 43 ++++++++++++++++++++++++
 tests/test_delete_duplicate_employees.py | 32 ++++++++++++++++++
 4 files changed, 114 insertions(+)
 create mode 100644 cristin_ms/delete_duplicate_employees.py
 create mode 100644 tests/fixtures/person_duplicate.json
 create mode 100644 tests/test_delete_duplicate_employees.py

diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py
new file mode 100644
index 0000000..528d951
--- /dev/null
+++ b/cristin_ms/delete_duplicate_employees.py
@@ -0,0 +1,34 @@
+"""
+Deletes duplicate records for the employees based on their person
+number (fødselsnummer). To run: "python3 -m cristin_ms.delete_duplicate_employees"
+"""
+import logging
+from typing import Optional
+
+from cristin_ms.cristin_export import get_context
+#from cristin_ms.config import CristinMsConfig
+from cristin_ms.context import CristinMsContext
+from cristin_ms.database import Info
+
+
+logger = logging.getLogger(__name__)
+
+
+def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None):
+    ctx = cristin_ms_context or get_context()
+
+    all_employees = ctx.info.get_all_by_type(Info.TYPE_EMPLOYEE)
+    all_pers_numbers = []
+    for emp in all_employees:
+        pers_number = emp.data.get('fnr', None)
+        if not pers_number:
+            logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number')
+            ctx.info.delete(emp)
+        if not pers_number in all_pers_numbers:
+            all_pers_numbers.append(pers_number)
+        else:
+            logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}')
+            ctx.info.delete(emp)
+
+if __name__ == '__main__':
+    delete_duplicates()
diff --git a/tests/conftest.py b/tests/conftest.py
index 4384463..fed8d26 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -185,6 +185,11 @@ def person_00101219():
     return load_json_file('person_00101219.json')
 
 
+@pytest.fixture
+def person_duplicate():
+    return load_json_file('person_duplicate.json')
+
+
 @pytest.fixture
 def stilling_30000004():
     return load_json_file('stilling_30000004.json')
diff --git a/tests/fixtures/person_duplicate.json b/tests/fixtures/person_duplicate.json
new file mode 100644
index 0000000..1ea1db2
--- /dev/null
+++ b/tests/fixtures/person_duplicate.json
@@ -0,0 +1,43 @@
+{
+  "id": "00101235",
+  "brukerident": "olgaboe",
+  "dfoBrukerident": "9900olgaboe",
+  "fornavn": "Olga",
+  "etternavn": "Bøe",
+  "fnr": "26012199693",
+  "annenId": {
+    "idType": "02",
+    "idBeskrivelse": "Passnummer",
+    "idNr": "JKLFJLDSJAFØ",
+    "idStartdato": "1965-12-01",
+    "idSluttdato": "9999-12-31"
+  },
+  "fdato": "1965-12-01",
+  "kjonn": "F",
+  "landkode": "NO",
+  "medarbeidergruppe": "4",
+  "medarbeiderundergruppe": "01",
+  "startdato": "2020-10-01",
+  "sluttdato": "9999-12-31",
+  "sluttarsak": null,
+  "stillingId": 30000004,
+  "stillingsprosent": "100.00",
+  "bevilgning": null,
+  "kostnadssted": null,
+  "organisasjonId": 10000018,
+  "jurBedriftsnummer": 912345678,
+  "lederflagg": false,
+  "portaltilgang": true,
+  "turnustilgang": false,
+  "eksternbruker": false,
+  "epost": null,
+  "privatTelefonnummer": null,
+  "telefonnummer": null,
+  "mobilnummer": null,
+  "privatPostadresse": "Trondheimsvei 66",
+  "privatPostnr": "7014",
+  "privatPoststed": "TRONDHEIM",
+  "endretDato": "2020-03-17",
+  "endretAv": "3-ANBE",
+  "tilleggsstilling": []
+}
diff --git a/tests/test_delete_duplicate_employees.py b/tests/test_delete_duplicate_employees.py
new file mode 100644
index 0000000..2d49d63
--- /dev/null
+++ b/tests/test_delete_duplicate_employees.py
@@ -0,0 +1,32 @@
+import pytest
+
+from dfo_sap_client.models import Ansatt
+from sqlalchemy import func
+
+from cristin_ms.delete_duplicate_employees import delete_duplicates
+from cristin_ms.database import Info
+
+def test_delete_duplicates(cristin_ms_context,
+              			   person_00101234,
+              			   person_duplicate):
+    service = cristin_ms_context.info
+
+    data = Ansatt(**person_00101234).dict()
+    info = Info(type=Info.TYPE_EMPLOYEE,
+                data=data,
+                fetched_at=func.now())
+    service.save(info)
+
+    # A duplicate that should be deleted below
+    data2 = Ansatt(**person_duplicate).dict()
+    info2 = Info(type=Info.TYPE_EMPLOYEE,
+                 data=data2,
+                 fetched_at=func.now())
+    service.save(info2)
+    
+    delete_duplicates(cristin_ms_context)
+
+    existing_record = service.get_by_json_field_query('employee',
+				                                      "fnr",
+				                                      data['fnr'])
+    assert len(existing_record) == 1
-- 
GitLab


From 0f9330adc73207f3de07cfda361b2db5f48ea994 Mon Sep 17 00:00:00 2001
From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no>
Date: Wed, 20 Oct 2021 18:27:44 +0200
Subject: [PATCH 2/4] Fix continuation-line indentation in duplicate-employee test

---
 tests/test_delete_duplicate_employees.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_delete_duplicate_employees.py b/tests/test_delete_duplicate_employees.py
index 2d49d63..6b318a8 100644
--- a/tests/test_delete_duplicate_employees.py
+++ b/tests/test_delete_duplicate_employees.py
@@ -7,8 +7,8 @@ from cristin_ms.delete_duplicate_employees import delete_duplicates
 from cristin_ms.database import Info
 
 def test_delete_duplicates(cristin_ms_context,
-              			   person_00101234,
-              			   person_duplicate):
+                           person_00101234,
+                           person_duplicate):
     service = cristin_ms_context.info
 
     data = Ansatt(**person_00101234).dict()
@@ -27,6 +27,6 @@ def test_delete_duplicates(cristin_ms_context,
     delete_duplicates(cristin_ms_context)
 
     existing_record = service.get_by_json_field_query('employee',
-				                                      "fnr",
-				                                      data['fnr'])
+                                                      "fnr",
+                                                      data['fnr'])
     assert len(existing_record) == 1
-- 
GitLab


From 66c1af3c002c20ba1b708a57a68cda58d6f143d3 Mon Sep 17 00:00:00 2001
From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no>
Date: Thu, 21 Oct 2021 14:08:12 +0200
Subject: [PATCH 3/4] Track seen personal numbers in a set instead of a list

---
 cristin_ms/delete_duplicate_employees.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py
index 528d951..8218fde 100644
--- a/cristin_ms/delete_duplicate_employees.py
+++ b/cristin_ms/delete_duplicate_employees.py
@@ -18,14 +18,14 @@ def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None):
     ctx = cristin_ms_context or get_context()
 
     all_employees = ctx.info.get_all_by_type(Info.TYPE_EMPLOYEE)
-    all_pers_numbers = []
+    all_pers_numbers = set()
     for emp in all_employees:
         pers_number = emp.data.get('fnr', None)
         if not pers_number:
             logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number')
             ctx.info.delete(emp)
         if not pers_number in all_pers_numbers:
-            all_pers_numbers.append(pers_number)
+            all_pers_numbers.add(pers_number)
         else:
             logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}')
             ctx.info.delete(emp)
-- 
GitLab


From 7078cdc34e69e20f7a867a03a650d824359f3ac8 Mon Sep 17 00:00:00 2001
From: Petr Kalashnikov <pka065@it6100016.klientdrift.uib.no>
Date: Thu, 28 Oct 2021 10:36:10 +0200
Subject: [PATCH 4/4] Style fixes; skip duplicate check after empty-fnr delete

An employee record without a personal number is deleted immediately.
Continue to the next record afterwards so the empty value is never added
to the set of seen numbers and the record is not deleted a second time.

---
 cristin_ms/delete_duplicate_employees.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cristin_ms/delete_duplicate_employees.py b/cristin_ms/delete_duplicate_employees.py
index 8218fde..818f9d4 100644
--- a/cristin_ms/delete_duplicate_employees.py
+++ b/cristin_ms/delete_duplicate_employees.py
@@ -6,7 +6,6 @@ import logging
 from typing import Optional
 
 from cristin_ms.cristin_export import get_context
-#from cristin_ms.config import CristinMsConfig
 from cristin_ms.context import CristinMsContext
 from cristin_ms.database import Info
 
@@ -24,11 +23,14 @@ def delete_duplicates(cristin_ms_context: Optional[CristinMsContext] = None):
         if not pers_number:
             logger.info(f'Deleting employee {emp.data["id"]} because of empty personal number')
             ctx.info.delete(emp)
+            # Already deleted: do not track the empty number or delete twice.
+            continue
-        if not pers_number in all_pers_numbers:
+        if pers_number not in all_pers_numbers:
             all_pers_numbers.add(pers_number)
         else:
             logger.info(f'Deleting duplicate entry for employee with personal number {pers_number}')
             ctx.info.delete(emp)
 
+
 if __name__ == '__main__':
     delete_duplicates()
-- 
GitLab