diff --git a/private_gpt/components/ingest/readers/rdfreader.py b/private_gpt/components/ingest/readers/rdfreader.py index 8096b397..3bc3ac99 100644 --- a/private_gpt/components/ingest/readers/rdfreader.py +++ b/private_gpt/components/ingest/readers/rdfreader.py @@ -46,7 +46,7 @@ class RDFReader(BaseReader): if len(labels) > 0: return labels[0].value - raise Exception(f"Label not found for: {uri}") + return None # Return None if label not found def load_data(self, file: Path, extra_info: dict | None = None) -> list[Document]: """Parse file.""" @@ -64,14 +64,19 @@ class RDFReader(BaseReader): for s, p, o in self.g_local: if p == RDFS.label: continue - print(s, p, o) - triple = ( - f"<{self.fetch_label_in_graphs(s, lang=lang)}> " - f"<{self.fetch_label_in_graphs(p, lang=lang)}> " - f"<{self.fetch_label_in_graphs(o, lang=lang)}>" - ) + + subj_label = self.fetch_label_in_graphs(s, lang=lang) + pred_label = self.fetch_label_in_graphs(p, lang=lang) + obj_label = self.fetch_label_in_graphs(o, lang=lang) + + if subj_label is None or pred_label is None or obj_label is None: + continue + + triple = f"<{subj_label}> " f"<{pred_label}> " f"<{obj_label}>" text_list.append(triple) text = "\n".join(text_list) + return [self._text_to_document(text, extra_info)] - return [Document(text, extra_info=extra_info)] + def _text_to_document(self, text: str, extra_info: dict | None = None) -> Document: + return Document(text=text, extra_info=extra_info or {}) diff --git a/tests/server/ingest/test.ttl b/tests/server/ingest/test.ttl new file mode 100644 index 00000000..99a74338 --- /dev/null +++ b/tests/server/ingest/test.ttl @@ -0,0 +1,358 @@ +@prefix ns1: . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . + +ns1:Q1044339 rdfs:label "Valeriano Balloni" ; + ns1:hasTeam ns1:Q13385, + ns1:Q289029, + ns1:Q297430, + ns1:Q650365, + ns1:Q6767 . + +ns1:Q110992321 rdfs:label "Tim Karius" ; + ns1:hasTeam ns1:Q1387210, + ns1:Q655591 . + +ns1:Q12402730 rdfs:label "Xoaquín Álvarez Corbacho" ; + ns1:hasTeam ns1:Q8749 . 
+ +ns1:Q12813965 rdfs:label "József Cserháti" ; + ns1:hasTeam ns1:Q732885 . + +ns1:Q13101502 rdfs:label "Alphonse Weicker" ; + ns1:hasTeam ns1:Q184266, + ns1:Q693092 . + +ns1:Q1497593 rdfs:label "Gediminas Budnikas" ; + ns1:hasTeam ns1:Q393357 . + +ns1:Q1531063 rdfs:label "Glenn W. Harrison" ; + ns1:hasTeam ns1:Q1034556 . + +ns1:Q16081110 rdfs:label "Moon Hyung-pyo" ; + ns1:hasTeam ns1:Q39988 . + +ns1:Q16091117 rdfs:label "Lee Ju-yeol" ; + ns1:hasTeam ns1:Q39988 . + +ns1:Q16299411 rdfs:label "Juha Joenväärä" ; + ns1:hasTeam ns1:Q1130636, + ns1:Q1232297 . + +ns1:Q1686485 rdfs:label "Jeff Immelt" ; + ns1:hasTeam ns1:Q5225674 . + +ns1:Q16942062 rdfs:label "Tito Montaño" ; + ns1:hasTeam ns1:Q127925 . + +ns1:Q1776728 rdfs:label "Svein Gjedrem" ; + ns1:hasTeam ns1:Q737937 . + +ns1:Q17917747 rdfs:label "Noel Newton Nethersole" ; + ns1:hasTeam ns1:Q3590248 . + +ns1:Q18541191 rdfs:label "Adalbert Kassai" ; + ns1:hasTeam ns1:Q1135735, + ns1:Q1195647, + ns1:Q1386940, + ns1:Q1689705, + ns1:Q841245, + ns1:Q842134 . + +ns1:Q18562973 rdfs:label "István Hagelmayer" ; + ns1:hasTeam ns1:Q606773 . + +ns1:Q192533 rdfs:label "Mark Carney" ; + ns1:hasTeam ns1:Q5676342 . + +ns1:Q1930105 rdfs:label "Michaela Vosbeck" ; + ns1:hasTeam ns1:Q1715018, + ns1:Q1792079, + ns1:Q2931573, + ns1:Q300032 . + +ns1:Q202693 rdfs:label "Jo Nesbø" ; + ns1:hasTeam ns1:Q208552 . + +ns1:Q2055385 rdfs:label "Alexandre Baptista" ; + ns1:hasTeam ns1:Q267245, + ns1:Q75729 . + +ns1:Q22003558 rdfs:label "Colin Cannonier" ; + ns1:hasTeam ns1:Q3590581 . + +ns1:Q2535499 rdfs:label "Tadao Horie" ; + ns1:hasTeam ns1:Q170566 . + +ns1:Q27491470 rdfs:label "Telesfor Banaszkiewicz" ; + ns1:hasTeam ns1:Q11821053, + ns1:Q1198772 . + +ns1:Q30308976 rdfs:label "Thomas Howden Fraser" ; + ns1:hasTeam ns1:Q117467 . + +ns1:Q311025 rdfs:label "Henry Paulson" ; + ns1:hasTeam ns1:Q5225674 . + +ns1:Q3132658 rdfs:label "Henry Braddon" ; + ns1:hasTeam ns1:Q55801 . 
+ +ns1:Q313682 rdfs:label "Oleguer Presas" ; + ns1:hasTeam ns1:Q10467, + ns1:Q17228, + ns1:Q2220788, + ns1:Q7156, + ns1:Q81888 . + +ns1:Q3470333 rdfs:label "Salvador Servià i Costa" ; + ns1:hasTeam ns1:Q188217, + ns1:Q35896 . + +ns1:Q354317 rdfs:label "Vebjørn Rodal" ; + ns1:hasTeam ns1:Q11993950 . + +ns1:Q3592042 rdfs:label "Étienne Antonelli" ; + ns1:hasTeam ns1:Q132885 . + +ns1:Q3808555 rdfs:label "Joan Trayter" ; + ns1:hasTeam ns1:Q3091261 . + +ns1:Q4011129 rdfs:label "Vicente Locaso" ; + ns1:hasTeam ns1:Q15799, + ns1:Q18640, + ns1:Q327172, + ns1:Q79800 . + +ns1:Q457755 rdfs:label "Alfred Lawson" ; + ns1:hasTeam ns1:Q461595, + ns1:Q653772 . + +ns1:Q4908745 rdfs:label "Bill Demory" ; + ns1:hasTeam ns1:Q219602, + ns1:Q4791461 . + +ns1:Q4939229 rdfs:label "Bolesław Banaś" ; + ns1:hasTeam ns1:Q3593958 . + +ns1:Q4961008 rdfs:label "Brendan Menton, Sr." ; + ns1:hasTeam ns1:Q629300 . + +ns1:Q4968933 rdfs:label "Rune Gerhardsen" ; + ns1:hasTeam ns1:Q2042878 . + +ns1:Q5405396 rdfs:label "Alejandro Brand" ; + ns1:hasTeam ns1:Q212564, + ns1:Q391984 . + +ns1:Q559712 rdfs:label "Magomedsalam Magomedov" ; + ns1:hasTeam ns1:Q2494171 . + +ns1:Q60735037 rdfs:label "Peter Morgan" ; + ns1:hasTeam ns1:Q18516 . + +ns1:Q6148645 rdfs:label "Tomás Soley Güell" ; + ns1:hasTeam ns1:Q7156 . + +ns1:Q65624037 rdfs:label "Thomas Staub" ; + ns1:hasTeam ns1:Q201969 . + +ns1:Q6708659 rdfs:label "Lyndhurst Falkiner Giblin" ; + ns1:hasTeam ns1:Q378628 . + +ns1:Q7172847 rdfs:label "Peter Henry" ; + ns1:hasTeam ns1:Q7054630 . + +ns1:Q7193582 rdfs:label "Pike Curtin" ; + ns1:hasTeam ns1:Q3589750 . + +ns1:Q732476 rdfs:label "Xavier Sala-i-Martin" ; + ns1:hasTeam ns1:Q3091261 . + +ns1:Q7436183 rdfs:label "Scott Cowen" ; + ns1:hasTeam ns1:Q16959086 . + +ns1:Q75748 rdfs:label "Hans Tietmeyer" ; + ns1:hasTeam ns1:Q2385504 . + +ns1:Q769073 rdfs:label "W. Morrissey" ; + ns1:hasTeam ns1:Q2367373 . + +ns1:Q84218605 rdfs:label "José María Echevarría Arteche" ; + ns1:hasTeam ns1:Q1103198 . 
+ +ns1:Q8667562 rdfs:label "Valerijonas Balčiūnas" ; + ns1:hasTeam ns1:Q186276 . + +ns1:Q89141301 rdfs:label "Anna Potok" ; + ns1:hasTeam ns1:Q4841 . + +ns1:Q9199508 rdfs:label "Czesława Pilarska" ; + ns1:hasTeam ns1:Q11733016 . + +ns1:Q947814 rdfs:label "Steinar Hoen" ; + ns1:hasTeam ns1:Q4573629 . + +ns1:Q963421 rdfs:label "Carl-Henric Svanberg" ; + ns1:hasTeam ns1:Q1653574 . + +ns1:Q98072140 rdfs:label "Q98072140" ; + ns1:hasTeam ns1:Q28214543 . + +ns1:Q1034556 rdfs:label "Hawthorn Football Club" . + +ns1:Q10467 rdfs:label "FC Barcelona Atlètic" . + +ns1:Q1103198 rdfs:label "Club de Campo Villa de Madrid" . + +ns1:Q1130636 rdfs:label "Oulun Kärpät" . + +ns1:Q1135735 rdfs:label "CS Corvinul Hunedoara" . + +ns1:Q11733016 rdfs:label "Stilon Gorzów Wielkopolski" . + +ns1:Q117467 rdfs:label "Royal Society of Edinburgh" . + +ns1:Q11821053 rdfs:label "Q11821053" . + +ns1:Q1195647 rdfs:label "FC Progresul București" . + +ns1:Q1198772 rdfs:label "Warta Poznań" . + +ns1:Q11993950 rdfs:label "Oppdal IL" . + +ns1:Q1232297 rdfs:label "Djurgårdens IF Hockey" . + +ns1:Q127925 rdfs:label "Club Aurora" . + +ns1:Q132885 rdfs:label "Olympique de Marseille" . + +ns1:Q13385 rdfs:label "Società Polisportiva Ars et Labor" . + +ns1:Q1386940 rdfs:label "FC Bihor Oradea" . + +ns1:Q1387210 rdfs:label "FC Jeunesse Canach" . + +ns1:Q15799 rdfs:label "Club Atlético River Plate" . + +ns1:Q1653574 rdfs:label "IF Björklöven" . + +ns1:Q1689705 rdfs:label "FC Jiul Petroșani" . + +ns1:Q16959086 rdfs:label "UConn Huskies football" . + +ns1:Q170566 rdfs:label "Japan national football team" . + +ns1:Q1715018 rdfs:label "TV Hörde" . + +ns1:Q17228 rdfs:label "Catalonia national football team" . + +ns1:Q1792079 rdfs:label "VC Schwerte" . + +ns1:Q184266 rdfs:label "Luxembourg national football team" . + +ns1:Q18516 rdfs:label "Hereford United F.C." . + +ns1:Q186276 rdfs:label "Lithuania national football team" . + +ns1:Q18640 rdfs:label "Gimnasia y Esgrima La Plata" . + +ns1:Q188217 rdfs:label "SEAT" . 
+ +ns1:Q201969 rdfs:label "FC Winterthur" . + +ns1:Q2042878 rdfs:label "Aktiv SK" . + +ns1:Q208552 rdfs:label "Molde FK" . + +ns1:Q212564 rdfs:label "Colombia national football team" . + +ns1:Q219602 rdfs:label "New York Jets" . + +ns1:Q2220788 rdfs:label "UDA Gramenet" . + +ns1:Q2367373 rdfs:label "NYU Violets" . + +ns1:Q2385504 rdfs:label "Q2385504" . + +ns1:Q2494171 rdfs:label "FC Dynamo Makhachkala" . + +ns1:Q267245 rdfs:label "Portugal national association football team" . + +ns1:Q28214543 rdfs:label "Trabzonspor" . + +ns1:Q289029 rdfs:label "U.S. Ancona" . + +ns1:Q2931573 rdfs:label "CJD Feuerbach" . + +ns1:Q297430 rdfs:label "S.S. Arezzo" . + +ns1:Q300032 rdfs:label "Germany women's national volleyball team" . + +ns1:Q327172 rdfs:label "Club Atlético Huracán" . + +ns1:Q35896 rdfs:label "Lancia" . + +ns1:Q3589750 rdfs:label "Western Australia cricket team" . + +ns1:Q3590248 rdfs:label "Jamaica national cricket team" . + +ns1:Q3590581 rdfs:label "Leeward Islands cricket team" . + +ns1:Q3593958 rdfs:label "ŁKS Łódź" . + +ns1:Q378628 rdfs:label "England national rugby union team" . + +ns1:Q391984 rdfs:label "Millonarios" . + +ns1:Q393357 rdfs:label "BC Žalgiris" . + +ns1:Q4573629 rdfs:label "IK Tjalve" . + +ns1:Q461595 rdfs:label "Atlanta Braves" . + +ns1:Q4791461 rdfs:label "Arizona Wildcats football" . + +ns1:Q4841 rdfs:label "Lech Poznań" . + +ns1:Q55801 rdfs:label "New Zealand national rugby union team" . + +ns1:Q5676342 rdfs:label "Harvard Crimson men's ice hockey" . + +ns1:Q606773 rdfs:label "Dorogi FC" . + +ns1:Q629300 rdfs:label "Home Farm F.C." . + +ns1:Q650365 rdfs:label "Carrarese Calcio" . + +ns1:Q653772 rdfs:label "Pittsburgh Pirates" . + +ns1:Q655591 rdfs:label "FC Koeppchen Wormeldange" . + +ns1:Q6767 rdfs:label "U.S. Livorno 1915" . + +ns1:Q693092 rdfs:label "Racing FC Union Luxembourg" . + +ns1:Q7054630 rdfs:label "North Carolina Tar Heels football" . + +ns1:Q732885 rdfs:label "Salgótarjáni BTC" . + +ns1:Q737937 rdfs:label "Lyn 1896 FK" . 
+ +ns1:Q75729 rdfs:label "Sporting CP" . + +ns1:Q79800 rdfs:label "Argentina national association football team" . + +ns1:Q81888 rdfs:label "AFC Ajax" . + +ns1:Q841245 rdfs:label "FC Argeș" . + +ns1:Q842134 rdfs:label "FC Sportul Studențesc București" . + +ns1:Q8749 rdfs:label "RC Celta de Vigo" . + +ns1:Q3091261 rdfs:label "FC Barcelona" . + +ns1:Q39988 rdfs:label "Yonsei University" . + +ns1:Q5225674 rdfs:label "Dartmouth Big Green football" . + +ns1:Q7156 rdfs:label "FC Barcelona" . diff --git a/tests/server/ingest/test_ingest_routes.py b/tests/server/ingest/test_ingest_routes.py index 896410a1..3bdba834 100644 --- a/tests/server/ingest/test_ingest_routes.py +++ b/tests/server/ingest/test_ingest_routes.py @@ -19,6 +19,12 @@ def test_ingest_accepts_pdf_files(ingest_helper: IngestHelper) -> None: assert len(ingest_result.data) == 1 +def test_ingest_accepts_ttl_files(ingest_helper: IngestHelper) -> None: + path = Path(__file__).parents[0] / "test.ttl" + ingest_result = ingest_helper.ingest_file(path) + assert len(ingest_result.data) == 1 + + def test_ingest_list_returns_something_after_ingestion( test_client: TestClient, ingest_helper: IngestHelper ) -> None: