Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 85%
95 statements (coverage.py v7.12.0, created at 2026-02-03 09:36 +0000)
import re
import typing
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from .nsjom_crawler import NsjomCrawler

source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages, from 1971 to 2009 (inclusive).
    eg: https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    """
    xissues: list[IssueData] = []
    year_start = 1971
    year_end = min(2009, datetime.now().year)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach the source and year to the error before re-raising
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues
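
# Hypothetical usage sketch: assuming an instantiated NsjomCrawler named
# `crawler` (providing download_file and source_website), the collection can
# be crawled with an empty `content` argument, since this function downloads
# each ns<year>.html page itself:
#
#     issues = parse_collection_content(crawler, "")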


def parse_issue_content(self, content: str, xissue: IssueData):
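    # Re-parses the full year page, restricted to this issue via pid_to_parse.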
    if not xissue.year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page.
    eg: https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """

    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues


def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag.
    eg: `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
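    # The title row is expected to read like "NSJOM Vol 39. No. 1"
    # (Volume="39", Issue="1") or "NSJOM Vol 5." for volumes without an issue
    # number (Issue stays None). These sample strings are illustrative,
    # derived from the pattern rather than taken from the site.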
    # No match: possibly an issue summary rather than an issue title?
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"
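    # eg "NSJOM_2009_39_1"; if issue_number is None the f-string yields a pid
    # ending in "_None".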

    if pid_to_parse and xissue.pid != pid_to_parse:
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue


def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag.
    eg: `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data
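
    # The author cell lists one name per line, separated by <br/> tags; the
    # raw markup is split on "<br/>" and each fragment is re-parsed to strip
    # any remaining tags.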
    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        self.logger.warning(
            f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link",
            extra={"pid": xarticle.pid},
        )
        return None

    href = self.get_str_attr(title_link_tag, "href")
    pdf_link = urljoin(self.source_website, href)
    add_pdf_link_to_xarticle(xarticle, pdf_link)
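
    # The third cell is assumed to hold the article's printed page range
    # (eg "3-10"); cleanup_str normalizes whitespace before set_pages parses it.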
    set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle