Coverage for src/crawler/by_source/nsjom/nsjom_xml_crawler.py: 85%
124 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import re
2import typing
4from bs4 import BeautifulSoup, Tag
6# from ptf.model_data import create_publisherdata
7from ptf.model_data import (
8 IssueData,
9 create_articledata,
10 create_contributor,
11 create_extlink,
12 create_issuedata,
13 create_publisherdata,
14 create_subj,
15)
17from crawler.utils import add_pdf_link_to_xarticle
19if typing.TYPE_CHECKING: 19 ↛ 20line 19 didn't jump to line 20 because the condition on line 19 was never true
20 from ..nsjom_crawler import NsjomCrawler
# Identifier of this crawler's source; used as the pid prefix and as
# ext-link metadata throughout this module.
source_domain = "NSJOM"
def parse_collection_content(
    self: "NsjomCrawler",
    _: str,
    source_domain: str = "NSJOM",
    xissue_pid_to_parse: str | None = None,
):
    """
    Parses all articles from one xml file : https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml
    From 2015 to today

    Downloads the whole-collection XML and groups every published <record>
    into issues keyed by (volume, issue) number.

    :param _: unused; the content is always re-downloaded from the fixed URL.
    :param xissue_pid_to_parse: when set, only the issue with this pid is
        built and returned (all other records are skipped).
    :returns: list of IssueData, each with its parsed articles attached.
    :raises ValueError: if the XML has no <records> root, or a record lacks
        a publicationType or year.
    """
    xissues: dict[tuple[str, str], IssueData] = {}
    url = "https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"
    content = self.download_file(url)
    soup = BeautifulSoup(content, "lxml-xml")
    record_container_element = soup.select_one("records")
    if record_container_element is None:
        raise ValueError(f"[{source_domain}] Cannot parse source")
    for record_element in record_container_element.select("record"):
        publication_type_tag = record_element.select_one("publicationType")
        if publication_type_tag is None:
            raise ValueError(f"[{source_domain}] Cannot determine article publicationType")
        # Only keep records that are actually published articles.
        if publication_type_tag.text != "published":
            continue
        year_tag = record_element.select_one("year")
        if year_tag is None or year_tag.text == "":
            raise ValueError(f"[{source_domain}] Cannot parse year from article")
        year = int(year_tag.text)
        # parse_article also returns the volume/issue numbers the article
        # belongs to; they form the grouping key below.
        xarticle, volume_number, issue_number = parse_article(
            self, record_element, source_domain=source_domain
        )
        # NOTE(review): the grouping key omits the year — this assumes volume
        # numbers are unique across years; verify against the source data.
        if (volume_number, issue_number) not in xissues:
            pid = f"{source_domain}_{year}__{volume_number}_{issue_number}"
            # Issue-level filter: when the pid does not match, the issue is
            # never added to the dict, so the append below is also skipped
            # for every later article of that same issue (this `continue`
            # fires each time).
            if xissue_pid_to_parse and xissue_pid_to_parse != pid:
                continue
            xissue = create_issuedata()
            parse_issue_tag(xissue, record_element, year)
            xissue.year = year_tag.text
            xissue.volume = volume_number
            xissue.number = issue_number
            xissue.pid = pid
            xissues[(volume_number, issue_number)] = xissue
        xissues[(volume_number, issue_number)].articles.append(xarticle)

    return list(xissues.values())
def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    """Parse a single issue by re-running the collection parser filtered to this issue's pid."""
    year_is_known = bool(xissue.year)
    if not year_is_known:
        raise ValueError("Issue year is not set")
    return parse_collection_content(self, content, source_domain, xissue.pid)
def parse_issue_tag(xissue: IssueData, article_tag: Tag, year: int) -> IssueData:
    """Fill issue-level metadata (publisher, source ext-link) from one article record.

    Mutates and returns *xissue*.
    """
    publisher_element = article_tag.select_one("publisher")
    # bs4 Tag truthiness: an empty <publisher/> is falsy, so it is skipped too.
    if publisher_element:
        publisher_data = create_publisherdata()
        publisher_data.name = publisher_element.text
        xissue.publisher = publisher_data

    xissue.ext_links.append(
        create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            metadata=source_domain,
        )
    )
    return xissue
def parse_article(self: "NsjomCrawler", article_tag: Tag, source_domain: str = "NSJOM"):
    """
    Parse one <record> element into an article.

    :param article_tag: the <record> element of the collection XML.
    :returns: (xarticle, volume_number, issue_number) so the caller can group
        articles into issues.
    :raises ValueError: if the record has no DOI, or no volume/issue numbers.
    """
    xarticle = create_articledata()

    doi_tag = article_tag.select_one("doi")
    if doi_tag is None:
        raise ValueError(f"[{source_domain}] : Article doi not found")
    xarticle.doi = doi_tag.text
    # Derive the pid from the DOI by replacing each '/', '.' or '-' with '_'.
    # BUGFIX: the previous pattern "\\/\\.-" only matched the literal
    # three-character sequence "/.-", leaving '/' and '.' in the pid.
    xarticle.pid = re.sub(r"[/.\-]", "_", doi_tag.text)

    page_start_tag = article_tag.select_one("startPage")
    page_end_tag = article_tag.select_one("endPage")
    if page_start_tag:
        xarticle.fpage = page_start_tag.text
    if page_end_tag:
        xarticle.lpage = page_end_tag.text

    date_published_tag = article_tag.select_one("publicationDate")
    if date_published_tag:
        xarticle.date_published_iso_8601_date_str = date_published_tag.text

    # Link back to the article's landing page on the publisher site.
    url_tag = article_tag.select_one("publisherRecordId")
    if url_tag:
        ext_link = create_extlink(
            rel="source",
            location=f"https://sites.dmi.uns.ac.rs/nsjom/paper.html?noid={url_tag.text}",
            metadata=source_domain,
        )
        xarticle.ext_links.append(ext_link)

    title_tag = article_tag.select_one("title")
    if title_tag:
        xarticle.title_tex = title_tag.text

    # TODO : Affiliations ?
    authors_container = article_tag.select_one("authors")
    if authors_container:
        for author_tag in authors_container.select("author"):
            author = create_contributor(role="author")
            author_name_tag = author_tag.select_one("name")
            if author_name_tag:
                author["string_name"] = author_name_tag.text
            corresponding = author_tag.get("corresponding")
            if corresponding == "1":
                author["corresponding"] = True
                email_tag = author_tag.select_one("email")
                if email_tag:
                    author["email"] = email_tag.text
            xarticle.contributors.append(author)

    abstract_tag = article_tag.select_one("abstract")
    if abstract_tag:
        # BUGFIX: attribute name was misspelled "langauge", so the language
        # attribute was never read (the keywords branch below spells it
        # correctly).
        abstract_language = abstract_tag.get("language", None)
        if abstract_language is None or isinstance(abstract_language, list):
            abstract_language = "eng"
        xarticle.abstracts.append(
            {
                "tag": "abstract",
                "value_tex": abstract_tag.text,
                "lang": abstract_language or self.detect_language(abstract_tag.text) or "und",
            }
        )

    keywords_tag = article_tag.select_one("keywords")
    if keywords_tag:
        # NOTE(review): a "language" attribute exists on <keywords> but the
        # original code never used it — keywords are stored as "en" to match
        # the MSC branch below; confirm whether the attribute should win.
        for kwd_tag in keywords_tag.select("keyword"):
            subject = create_subj()
            subject["value"] = kwd_tag.text
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    msc_tag = article_tag.select_one("MSCs")
    if msc_tag:
        for msc_subj in msc_tag.select("MSC"):
            subject = create_subj()
            subject["value"] = msc_subj.text
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

    pdf_location_tag = article_tag.select_one("filelocation")
    pdf_name_tag = article_tag.select_one("file")
    if pdf_location_tag and pdf_name_tag:
        pdf_url = "https://sites.dmi.uns.ac.rs/nsjom/" + pdf_location_tag.text + pdf_name_tag.text
        add_pdf_link_to_xarticle(xarticle, pdf_url)

    volume_tag = article_tag.select_one("volume")
    issue_tag = article_tag.select_one("issue")
    if volume_tag is None or issue_tag is None:
        raise ValueError(
            f"[{source_domain}] {xarticle.doi} Cannot parse volume or issue from article"
        )

    # Citations ?
    return xarticle, volume_tag.text, issue_tag.text