Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 81%
99 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from ..nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"

def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages: https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    From 1971 to 2009 (inclusive).
    """
    xissues: list[IssueData] = []
    year_start = 1971
    year_end = min(2009, datetime.now().year)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach context to the error before re-raising
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues
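
# Illustrative sketch, not part of the original module: parse_collection_content
# downloads one page per year using the URL pattern shown here. The helper name
# _nsjom_year_urls is hypothetical and only demonstrates that pattern.
def _nsjom_year_urls(year_start: int = 1971, year_end: int = 2009) -> list[str]:
    return [
        f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        for year in range(year_start, year_end + 1)
    ]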

def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)

def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one page,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues
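
# Illustrative sketch, not part of the original module: parse_year locates issue
# tables with the "body>table" selector. The helper name and the HTML fragment
# below are hypothetical and only show how that selector behaves.
def _demo_issue_table_selector() -> None:
    html = "<body><table><tr><td>NSJOM Vol 1.</td></tr></table></body>"
    soup = BeautifulSoup(html, "html.parser")
    assert len(soup.select("body>table")) == 1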

def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag,
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
    # Issue summary?
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue
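
# Illustrative sketch, not part of the original module: the volume/issue regex in
# parse_issue_tag matches headers such as "NSJOM Vol 39. No. 1". The helper name
# and the sample string are hypothetical.
def _demo_issue_title_regex() -> None:
    sample = "NSJOM\n Vol 39. No. 1"
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        sample,
    )
    assert match is not None
    assert match.group("Volume") == "39"
    assert match.group("Issue") == "1"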

def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag,
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue
        author_name = cleanup_str(author_name)
        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        print(f"[{source_domain}] {issue_pid}_{index}: Cannot find article pdf link")
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):
        raise ValueError("Article has multiple pdf hrefs")
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    self.set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle
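
# Illustrative sketch, not part of the original module: parse_article_tag splits
# the author cell on "<br/>" and strips the markup from each part. The helper
# name and the <td> fragment are hypothetical.
def _demo_author_split() -> None:
    author_tag = BeautifulSoup("<td>A. Author<br/>B. Author</td>", "html.parser").td
    authors = [
        BeautifulSoup(part, "html.parser").text.strip()
        for part in str(author_tag).split("<br/>")
    ]
    assert authors == ["A. Author", "B. Author"]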