From cf9984f92dd5cf93ad50ac23574b2d0c8cc661c4 Mon Sep 17 00:00:00 2001
From: WangK2 <kw221225@gmail.com>
Date: Fri, 3 May 2024 00:09:43 +0800
Subject: [PATCH] Rewrite cssci rules

---
 README.md            |  19 +-
 refparse/__init__.py |   2 +-
 refparse/cssci.py    | 250 ++++++++-----------------
 refparse/scopus.py   |   4 +-
 tests/test_cssci.py  | 424 +++++++++++++------------------------------
 5 files changed, 214 insertions(+), 485 deletions(-)

diff --git a/README.md b/README.md
index 6337694..3a79561 100644
--- a/README.md
+++ b/README.md
@@ -27,16 +27,13 @@ $ pip install refparse
 
 ## Return Fields
 
-|        | Web of Science | Scopus  | CSSCI*  |
+|        | Web of Science | Scopus  | CSSCI   |
 | :---:  | :---:          | :---:   | :---:   |
-| author | &check;        | &check; |         |
-| title  |                | &check; |         |
-| source | &check;        | &check; |         |
-| volume | &check;        | &check; |         |
-| issue  |                | &check; |         |
-| page   | &check;        | &check; |         |
-| year   | &check;        | &check; |         |
+| author | &check;        | &check; | &check; |
+| title  |                | &check; | &check; |
+| source | &check;        | &check; | &check; |
+| volume | &check;        | &check; | &check; |
+| issue  |                | &check; | &check; |
+| page   | &check;        | &check; | &check; |
+| year   | &check;        | &check; | &check; |
 | doi    | &check;        |         |         |
-| identifier except doi | |         | &check; |
-
-\* CSSCI will return differently depending on reference type.
diff --git a/refparse/__init__.py b/refparse/__init__.py
index a5413c3..85039d2 100644
--- a/refparse/__init__.py
+++ b/refparse/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.4.1"
+__version__ = "0.5.0"
 
 from typing import Literal, Optional
 from .wos import ParseWos
diff --git a/refparse/cssci.py b/refparse/cssci.py
index 9ec9147..9decbfd 100644
--- a/refparse/cssci.py
+++ b/refparse/cssci.py
@@ -4,202 +4,104 @@
 
 class ParseCssci:
     def __init__(self, ref: str):
-        self.ref = self.clean(ref)
-        self.dot_count = self.ref.count(".")
+        self.ref = ref
 
     @staticmethod
     def clean(ref: str) -> str:
-        return re.sub(r"^\d*\.", "", ref)
+        ref = ref.strip(".")
+        ref = re.sub(r"^\d*\.", "", ref)
+        # Drop the trailing date (and whatever follows it) from a newspaper ref
+        # e.g. 3.郑晋鸣.南京城东五高校建图书馆联合体.光明日报.04.24(7)
+        if re.search(r"\.\d{1,2}\.\d{1,2}(?=\(|$)", ref):
+            ref = re.split(r"\.(?=\d+\.)", ref, 1)[0]
+        return ref
 
-    def parse(self):
-        # Web resource
-        if re.search(r"\.\d{4}$", self.ref):
-            return self.parse_web()
-
-        if re.search(r"[\u4e00-\u9fa5]", self.ref):
-            if "GB/" in self.ref:
-                return self.parse_standard()
-
-            elif self.ref[-3:] == "出版社":
-                return self.parse_book()
-
-            elif ":学位论文." in self.ref:
-                return self.parse_thesis()
-
-            # Newspaper
-            elif re.search(r"[^\d]\.\d{1,2}\.\d{1,2}", self.ref):
-                return self.parse_newspaper()
-
-            # Patent 1
-            elif re.search(r"\.CN\d{9}[A-Z]$", self.ref):
-                return self.parse_patent1()
-
-            # Patent 2
-            elif re.search(r"^一种", self.ref):
-                return self.parse_patent2()
+    @staticmethod
+    def drop(ref: str) -> Optional[str]:
+        if re.search(r"^\d+\.\d", ref):
+            return None
 
-            else:
-                return self.parse_paper()
-        else:
-            return self.parse_english()
+        elif re.search(r"^\d+\.\.$", ref):
+            return None
 
-    def parse_web(self) -> dict[str, Optional[str]]:
-        if self.dot_count == 2:
-            author, title, year = self.ref.split(".")
-        elif self.dot_count > 2:
-            author = self.ref.split(".", 1)[0]
-            year = self.ref.rsplit(".", 1)[1]
-            title = self.ref.replace(author + ".", "").replace("." + year, "")
-        if author == "":
-            author = None
-        return {"type": "web", "author": author, "title": title, "year": year}
+        # e.g. 2..Campbell v. Acuff-Rose Music, Inc., 510 U. S. 569 (1994),1994
+        elif re.search(r"\d{4}\),\d{4}", ref) and re.search(r"^\d+\.\.", ref):
+            return None
 
-    def parse_standard(self) -> dict[str, Optional[str]]:
-        if "出版社" in self.ref:
-            year = None
-            if self.dot_count == 2:
-                author, title, source = self.ref.split(".")
-            elif self.dot_count > 2:
-                author = self.ref.split(".", 1)[0]
-                source = self.ref.rsplit(".", 1)[1]
-                title = self.ref.split(".", 1)[1].replace("." + source, "")
-        else:
-            source = None
-            author = self.ref.split(".", 1)[0]
-            if re.search(r",\d{4}$", self.ref):
-                year = self.ref[-4:]
-                title = self.ref.split(".", 1)[1].replace("," + year, "")
-            else:
-                year = None
-                title = self.ref.split(".", 1)[1]
+        # Drop patent
+        # e.g. 26.图书上下架机器人.CN102152293A
+        elif re.search(r"\.CN\d{9}[A-Z]$", ref):
+            return None
 
-        if author == "":
-            author = None
-        if title.startswith("GB/"):
-            identifier, title = re.split(r"[,，] ?", title, 1)
+        # e.g. 9.一种基于RFID技术的自动式图书智能盘点机器人:201620075212.0.2016-01-25
+        elif re.search(r"^\d+\.一种", ref):
+            return None
         else:
-            title, identifier = re.split(r":(?=GB)", title, 1)
-        return {
-            "type": "standard",
-            "author": author,
-            "title": title,
-            "source": source,
-            "year": year,
-            "identifier": identifier,
-        }
+            return ref
 
-    def parse_book(self) -> dict[str, Optional[str]]:
-        author, title, source = self.ref.split(".")
-        return {"type": "book", "author": author, "title": title, "source": source}
+    def extract(self, pattern: str, ref: Optional[str] = None, flags=0) -> Optional[str]:
+        if not ref:
+            ref = self.ref
+        match = re.search(pattern, ref, flags)
+        return match.group(1) if match else None
 
-    def parse_thesis(self) -> dict[str, Optional[str]]:
-        author, title, other = self.ref.split(".", 2)
-        title = title[:-5]
-        source, year = other.split(",")
-        year = year if len(year) == 4 else year[:4]
-        return {"type": "thesis", "author": author, "title": title, "source": source, "year": year}
+    def extract_author(self) -> Optional[str]:
+        if re.search(r"[A-Z]\.\.", self.ref):
+            author = self.extract(r"^(.*\.)\.")
 
-    def parse_newspaper(self) -> dict[str, Optional[str]]:
-        author, title, source, date = self.ref.split(".", 3)
-        if author == "":
+        elif re.search(r"^\.", self.ref):
             author = None
-        date = date.split("(", 1)[0]
-        return {"type": "newspaper", "author": author, "title": title, "source": source, "date": date}
-
-    def parse_patent1(self) -> dict[str, Optional[str]]:
-        title, identifier = self.ref.split(".", 1)
-        return {"type": "patent", "title": title, "identifier": identifier}
 
-    def parse_patent2(self) -> dict[str, Optional[str]]:
-        title, other = self.ref.split(":", 1)
-        identifier = other.rsplit(".", 1)[0]
-        identifier = re.sub(r"^[^\d]*(?=\d)", "", identifier)
-        return {"type": "patent", "title": title, "identifier": identifier}
-
-    def parse_paper(self) -> dict[str, Optional[str]]:
-        author, title, source, year, volume_issue = self.ref.split(".", 4)
-        if volume_issue.startswith("("):
-            volume = None
-            issue = volume_issue.strip("()")
         else:
-            volume, issue = volume_issue.split("(")
-            issue = issue.strip(")")
-        return {
-            "type": "paper",
-            "author": author,
-            "title": title,
-            "source": source,
-            "year": year,
-            "volume": volume,
-            "issue": issue,
-        }
-
-    def parse_english(self) -> dict[str, Optional[str]]:
-        def split_author_title(ref_left: str):
-            if ".." in self.ref:
-                author, title = ref_left.split("..", 1)
-                author += "."
-
+            author = self.ref.split(".", 1)[0]
+        return author
+
+    def parse(self) -> Optional[dict[str, Optional[str]]]:
+        if self.drop(self.ref):
+            self.ref = self.clean(self.ref)
+            dot_count = self.ref.count(".")
+            if dot_count == 0:
+                return {
+                    "author": None,
+                    "title": self.ref,
+                    "source": None,
+                    "year": None,
+                    "volume": None,
+                    "issue": None,
+                }
+
+            volume, issue, page = None, None, None
+            year = self.extract(r"[\.,](\d{4})\b")
+            if year:
+                volume = self.extract(r"[\.,]\d{4}\.(\d+)\b")
+                issue = self.extract(r"\((\d+)\)$")
+                if not (volume and issue):
+                    # e.g. 22.邓万云.利用Internet开拓我州科技信息服务新领域,2006:204-208
+                    page = self.extract(r":([\d-]+)$")
+
+            author = self.extract_author()
+            # The \b keeps a title that merely begins with four digits from being split as a year, e.g. 1..2021年度江苏省公共图书馆大数据统计报告
+            ref_left = re.split(r"[\.,](?=\d{4}\b)", self.ref, 1)[0]
+            if author:
+                ref_left = ref_left.replace(author + ".", "", 1)
             else:
-                dot_count = ref_left.count(".")
-                if dot_count == 1:
-                    author, title = ref_left.split(".", 1)
-
-                elif dot_count > 1:
-                    author, title = ref_left.rsplit(".", 1)
-                else:
-                    author, title = None, None
-            return author, title
-
-        # English book
-        if re.search(r":[A-Z]", self.ref):
+                ref_left = ref_left[1:]
             try:
-                ref_left, year_page = re.split(r",(?=\d{4})", self.ref, 1)
-            except ValueError:
-                ref_left = self.ref
-                year = None
-                page = None
+                title, source = ref_left.rsplit(".", 1)
+            except:
+                title = ref_left
+                source = None
             else:
-                year = year_page[:4]
-                page = year_page[5:]
-
-            if re.search(r"\.[A-Z][A-Za-z()]+:", ref_left):
-                ref_left, source = ref_left.rsplit(".", 1)
-
-            elif re.search(r"\.(?:[A-Z][A-Za-z]+ ){1,2}[A-Z][A-Za-z]+:[A-Z]", ref_left):
-                ref_left, source = ref_left.rsplit(".", 1)
-
-            else:
-                ref_left, source = ref_left.rsplit(":", 1)
-
-            author, title = split_author_title(ref_left)
+                # e.g. 4..Society 5.0——科学技术政策——内阁府.2020
+                if re.search(r"^\d", source):
+                    title = title + "." + source
+                    source = None
             return {
-                "type": "english-book",
                 "author": author,
                 "title": title,
                 "source": source,
                 "year": year,
+                "volume": volume,
+                "issue": issue,
                 "page": page,
             }
-
-        # English paper
-        ref_left, year_volume_issue = re.split(r"\.(?=\d{4})", self.ref, 1)
-        year = year_volume_issue.split(".", 1)[0]
-        issue = year_volume_issue.split("(", 1)[1].strip(")")
-        if ".(" in year_volume_issue:
-            volume = None
-        else:
-            volume = year_volume_issue.split("(", 1)[0][5:]
-
-        ref_left, source = ref_left.rsplit(".", 1)
-        author, title = split_author_title(ref_left)
-
-        return {
-            "type": "english-paper",
-            "author": author,
-            "title": title,
-            "source": source,
-            "year": year,
-            "volume": volume,
-            "issue": issue,
-        }
diff --git a/refparse/scopus.py b/refparse/scopus.py
index 653f4bb..5fc10a2 100644
--- a/refparse/scopus.py
+++ b/refparse/scopus.py
@@ -46,7 +46,7 @@ def clean(ref: str) -> str:
         # Add page symbol
         if re.search(r", \d+-\d+, \(", ref):
             if not re.search(r", \d{4}-\d{4}, ", ref):
-                match = re.search(r", (\d+-\d+), ", ref).group(1)
+                match = re.search(r", (\d+-\d+), ", ref).group(1) # type: ignore
                 if re.search(r"\d, \d+-", ref):
                     a, b = (int(i) for i in match.split("-"))
                     # Exclude possible issue
@@ -201,7 +201,7 @@ def parse_general(self) -> dict[str, Optional[str]]:
         if title == "unknown":
             # Remove other fields info
             if source:
-                repr_str = re.match(r"([A-Za-z\d\. ]{,20})", source)[1]
+                repr_str = re.match(r"([A-Za-z\d\. ]{,20})", source).group(1) # type: ignore
                 ref_left = re.sub(f", {repr_str}.*$", "", self.ref)
             elif volume:
                 ref_left = re.sub(f", {volume}.*$", "", self.ref)
diff --git a/tests/test_cssci.py b/tests/test_cssci.py
index 4aaa485..f1661de 100644
--- a/tests/test_cssci.py
+++ b/tests/test_cssci.py
@@ -3,364 +3,228 @@
 from refparse.cssci import ParseCssci
 
 test_clean_data = [
-    (
-        "1.康德.纯粹理性批判.北京:人民出版社",
-        "康德.纯粹理性批判.北京:人民出版社",
-    ),
     (
         "7..中华人民共和国公共图书馆法.2021",
         ".中华人民共和国公共图书馆法.2021",
     ),
+    (
+        "1.段美珍.智慧图书馆的内涵特点及其认知模型研究.图书情报工作.",
+        "段美珍.智慧图书馆的内涵特点及其认知模型研究.图书情报工作",
+    ),
+    (
+        "3.郑晋鸣.南京城东五高校建图书馆联合体.光明日报.04.24(7)",
+        "郑晋鸣.南京城东五高校建图书馆联合体.光明日报",
+    ),
 ]
 
-test_web_data = [
+test_drop_data = [
     (
-        "8.Google.Analytics.js.2021",
-        {
-            "type": "web",
-            "author": "Google",
-            "title": "Analytics.js",
-            "year": "2021",
-        },
+        "1.17 U. S. C. § 107",
+        None,
     ),
     (
-        "9..CNNIC:微博用户达2.5亿，近半数网民使用.2012",
-        {
-            "type": "web",
-            "author": None,
-            "title": "CNNIC:微博用户达25亿，近半数网民使用2012",
-            "year": "2012",
-        },
+        "1..",
+        None,
     ),
     (
-        "22.IFLA.IFLA STRATEGY 2019-2024.2019",
-        {
-            "type": "web",
-            "author": "IFLA",
-            "title": "IFLA STRATEGY 2019-2024",
-            "year": "2019",
-        },
+        "2..Campbell v. Acuff-Rose Music, Inc., 510 U. S. 569 (1994),1994",
+        None,
     ),
-]
-
-test_standard_data = [
     (
-        "8..GB/T37043-2018，智慧城市术语.北京:中国标准出版社",
-        {
-            "type": "standard",
-            "author": None,
-            "title": "智慧城市术语",
-            "source": "北京:中国标准出版社",
-            "year": None,
-            "identifier": "GB/T37043-2018",
-        },
+        "26.图书上下架机器人.CN102152293A",
+        None,
     ),
     (
-        "17.全国信息技术标准化技术委员会教育技术分会.GB/T 36342-2018,智慧校园总体框架,2018",
-        {
-            "type": "standard",
-            "author": "全国信息技术标准化技术委员会教育技术分会",
-            "title": "智慧校园总体框架",
-            "source": None,
-            "year": "2018",
-            "identifier": "GB/T 36342-2018",
-        },
+        "9.一种基于RFID技术的自动式图书智能盘点机器人:201620075212.0.2016-01-25",
+        None,
     ),
     (
-        "30.全国信息技术标准化技术委员会.智慧城市，数据融合，第5部分:市政基础设施数据元素:GB/T 36625.5-2019.北京:中国标准出版社",
-        {
-            "type": "standard",
-            "author": "全国信息技术标准化技术委员会",
-            "title": "智慧城市，数据融合，第5部分:市政基础设施数据元素",
-            "source": "北京:中国标准出版社",
-            "year": None,
-            "identifier": "GB/T 36625.5-2019",
-        },
+        "19.皮亚杰.儿童心理学.北京:商务印书馆",
+        "19.皮亚杰.儿童心理学.北京:商务印书馆",
     ),
 ]
 
-test_book_data = [
+test_author_data = [
     (
-        "14.吴建中.21世纪图书馆新论.上海:上海科学技术文献出版社",
-        {
-            "type": "book",
-            "author": "吴建中",
-            "title": "21世纪图书馆新论",
-            "source": "上海:上海科学技术文献出版社",
-        },
+        "25.Remy,M..Information Literacy: The Information Commons Connection.California,2004",
+        "Remy,M.",
     ),
     (
-        "3.金元浦.中国文化概论.北京:首都师范大学出版社",
-        {
-            "type": "book",
-            "author": "金元浦",
-            "title": "中国文化概论",
-            "source": "北京:首都师范大学出版社",
-        },
+        "1..2021年度江苏省公共图书馆大数据统计报告",
+        None,
+    ),
+    (
+        "14.陈大庆.FOLIO在深圳大学,2018",
+        "陈大庆",
     ),
 ]
 
-test_thesis_data = [
+test_parse_data = [
     (
-        "21.郑怿昕.智慧图书馆环境下馆员核心能力研究:学位论文.南京:南京农业大学,2015:27-31",
+        "5.北京市第一中级人民法院民事判决书(2011)一中民初字第1321号",
         {
-            "type": "thesis",
-            "author": "郑怿昕",
-            "title": "智慧图书馆环境下馆员核心能力研究",
-            "source": "南京:南京农业大学",
-            "year": "2015",
+            "author": None,
+            "title": "北京市第一中级人民法院民事判决书(2011)一中民初字第1321号",
+            "source": None,
+            "year": None,
+            "volume": None,
+            "issue": None,
         },
     ),
     (
-        "17.段美珍.智慧图书馆建设评价模型与应用研究:学位论文.北京:中国科学院大学,2020",
+        "10..GB/T 35273-2020，信息安全技术个人信息安全规范",
         {
-            "type": "thesis",
-            "author": "段美珍",
-            "title": "智慧图书馆建设评价模型与应用研究",
-            "source": "北京:中国科学院大学",
-            "year": "2020",
+            "author": None,
+            "title": "GB/T 35273-2020，信息安全技术个人信息安全规范",
+            "source": None,
+            "year": None,
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
-]
-
-test_newspaper_data = [
     (
         "6..习近平在第二届世界互联网大会开幕式上的讲话.人民日报.12.17(2)",
         {
-            "type": "newspaper",
             "author": None,
             "title": "习近平在第二届世界互联网大会开幕式上的讲话",
             "source": "人民日报",
-            "date": "12.17",
+            "year": None,
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
     (
-        "25.曹磊.大数据:数字世界的智慧基因.文汇报.11.8(12)",
+        "21.郑怿昕.智慧图书馆环境下馆员核心能力研究:学位论文.南京:南京农业大学,2015:27-31",
         {
-            "type": "newspaper",
-            "author": "曹磊",
-            "title": "大数据:数字世界的智慧基因",
-            "source": "文汇报",
-            "date": "11.8",
+            "author": "郑怿昕",
+            "title": "智慧图书馆环境下馆员核心能力研究:学位论文",
+            "source": "南京:南京农业大学",
+            "year": "2015",
+            "volume": None,
+            "issue": None,
+            "page": "27-31",
         },
     ),
     (
-        "65..图书馆来了机器人管理员.宁波日报.1.8",
+        "9..CNNIC:微博用户达2.5亿，近半数网民使用.2012",
         {
-            "type": "newspaper",
             "author": None,
-            "title": "图书馆来了机器人管理员",
-            "source": "宁波日报",
-            "date": "1.8",
+            "title": "CNNIC:微博用户达2.5亿，近半数网民使用",
+            "source": None,
+            "year": "2012",
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
-]
-
-test_patent1_data = [
     (
-        "26.图书上下架机器人.CN102152293A",
+        "4..Society 5.0——科学技术政策——内阁府.2020",
         {
-            "type": "patent",
-            "title": "图书上下架机器人",
-            "identifier": "CN102152293A",
+            "author": None,
+            "title": "Society 5.0——科学技术政策——内阁府",
+            "source": None,
+            "year": "2020",
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
-    )
-]
-test_patent2_data = [
+    ),
     (
-        "9.一种基于RFID技术的自动式图书智能盘点机器人:201620075212.0.2016-01-25",
+        "5.杨新涯.2.0的图书馆.广州:中山大学出版社",
         {
-            "type": "patent",
-            "title": "一种基于RFID技术的自动式图书智能盘点机器人",
-            "identifier": "201620075212.0",
+            "author": "杨新涯",
+            "title": "2.0的图书馆",
+            "source": "广州:中山大学出版社",
+            "year": None,
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
     (
-        "39.一种基于区块链的金融安全存证平台系统及方法:中国，201910838935. X(2019-09-05)",
+        "9.全国人民代表大会常务委员会.中华人民共和国个人信息保护法,2021",
         {
-            "type": "patent",
-            "title": "一种基于区块链的金融安全存证平台系统及方法",
-            "identifier": "201910838935",
+            "author": "全国人民代表大会常务委员会",
+            "title": "中华人民共和国个人信息保护法",
+            "source": None,
+            "year": "2021",
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
-]
-
-
-test_paper_data = [
     (
-        "2.严栋.基于物联网的智慧图书馆.图书馆学刊.2010.32(7)",
+        "1.马费成.图书情报学与元宇宙:共识共创共进",
         {
-            "type": "paper",
-            "author": "严栋",
-            "title": "基于物联网的智慧图书馆",
-            "source": "图书馆学刊",
-            "year": "2010",
-            "volume": "32",
-            "issue": "7",
+            "author": "马费成",
+            "title": "图书情报学与元宇宙:共识共创共进",
+            "source": None,
+            "year": None,
+            "volume": None,
+            "issue": None,
+            "page": None,
         },
     ),
     (
         "39.刘炜.5G与智慧图书馆建设.中国图书馆学报.2019.45(5)",
         {
-            "type": "paper",
             "author": "刘炜",
             "title": "5G与智慧图书馆建设",
             "source": "中国图书馆学报",
             "year": "2019",
             "volume": "45",
             "issue": "5",
-        },
-    ),
-]
-
-test_english_data = [
-    (
-        "20.Alexei,P..Rite of passage.USA:Galaxy Publishing Co",
-        {
-            "type": "english-book",
-            "author": "Alexei,P.",
-            "title": "Rite of passage",
-            "source": "USA:Galaxy Publishing Co",
-            "year": None,
             "page": None,
         },
     ),
-    (
-        "8.Sohail,S S.Book recommendation system using opinion mining technique:IEEE,2013:1609-1614",
-        {
-            "type": "english-book",
-            "author": "Sohail,S S",
-            "title": "Book recommendation system using opinion mining technique",
-            "source": "IEEE",
-            "year": "2013",
-            "page": "1609-1614",
-        },
-    ),
     (
         "7.Vaz,P C.Improving a hybrid literary book recommendation system through author ranking.New York:Association for Computing Machinery,2012:387-388",
         {
-            "type": "english-book",
             "author": "Vaz,P C",
             "title": "Improving a hybrid literary book recommendation system through author ranking",
             "source": "New York:Association for Computing Machinery",
             "year": "2012",
+            "volume": None,
+            "issue": None,
             "page": "387-388",
         },
     ),
     (
-        "7.Hunzaker,M.B. Fallin.Mapping Cultural Schemas: From Theory to Method.American Sociological Review.2019.84(5)",
+        "22.IFLA.IFLA STRATEGY 2019-2024.2019",
         {
-            "type": "english-paper",
-            "author": "Hunzaker,M.B. Fallin",
-            "title": "Mapping Cultural Schemas: From Theory to Method",
-            "source": "American Sociological Review",
+            "author": "IFLA",
+            "title": "IFLA STRATEGY 2019-2024",
+            "source": None,
             "year": "2019",
-            "volume": "84",
-            "issue": "5",
-        },
-    ),
-    (
-        "1.Aittola,M..Smart Library: Location-Aware Mobile Library Service.International Symposium on Human Computer Interaction with Mobile Devices and Services.2003.(5)",
-        {
-            "type": "english-paper",
-            "author": "Aittola,M.",
-            "title": "Smart Library: Location-Aware Mobile Library Service",
-            "source": "International Symposium on Human Computer Interaction with Mobile Devices and Services",
-            "year": "2003",
             "volume": None,
-            "issue": "5",
+            "issue": None,
+            "page": None,
         },
     ),
-]
-
-test_parse_data = [
     (
-        "3..2021第五届中国未来智慧图书馆发展论坛.2021",
+        "1.Lu,Y..Digital Twin-driven smart manufacturing: Connotation, reference model, applications and research issues.Robotics and Computer-Integrated Manufacturing.2020.61",
         {
-            "type": "web",
-            "author": None,
-            "title": "2021第五届中国未来智慧图书馆发展论坛",
-            "year": "2021",
+            "author": "Lu,Y.",
+            "title": "Digital Twin-driven smart manufacturing: Connotation, reference model, applications and research issues",
+            "source": "Robotics and Computer-Integrated Manufacturing",
+            "year": "2020",
+            "volume": "61",
+            "issue": None,
+            "page": None,
         },
     ),
     (
-        "10..GB/T 35273-2020，信息安全技术个人信息安全规范",
+        "14.Hufflen,J M.Languages for Bibliography Styles.TUGB",
         {
-            "type": "standard",
-            "author": None,
-            "title": "信息安全技术个人信息安全规范",
-            "source": None,
+            "author": "Hufflen,J M",
+            "title": "Languages for Bibliography Styles",
+            "source": "TUGB",
             "year": None,
-            "identifier": "GB/T 35273-2020",
-        },
-    ),
-    (
-        "14.吴慰慈.图书馆学概论.北京:北京图书馆出版社",
-        {
-            "type": "book",
-            "author": "吴慰慈",
-            "title": "图书馆学概论",
-            "source": "北京:北京图书馆出版社",
-        },
-    ),
-    (
-        "37.潘星.智慧图书馆联盟建设策略研究:学位论文.扬州:扬州大学,2021",
-        {
-            "type": "thesis",
-            "author": "潘星",
-            "title": "智慧图书馆联盟建设策略研究",
-            "source": "扬州:扬州大学",
-            "year": "2021",
-        },
-    ),
-    (
-        "12.王伟健.一个来了还想再来的图书馆.人民日报.1.7(11)",
-        {
-            "type": "newspaper",
-            "author": "王伟健",
-            "title": "一个来了还想再来的图书馆",
-            "source": "人民日报",
-            "date": "1.7",
-        },
-    ),
-    (
-        "25.一种用于图书馆机器人的书本上架装置.CN202880264U",
-        {
-            "type": "patent",
-            "title": "一种用于图书馆机器人的书本上架装置",
-            "identifier": "CN202880264U",
-        },
-    ),
-    (
-        "10.一种基于RFID标签RSSI信号值的图书排序方法:201610050963.1.2016-01-25",
-        {
-            "type": "patent",
-            "title": "一种基于RFID标签RSSI信号值的图书排序方法",
-            "identifier": "201610050963.1",
-        },
-    ),
-    (
-        "33.司莉.科学数据的标准规范体系框架研究.图书馆.2016.(5)",
-        {
-            "type": "paper",
-            "author": "司莉",
-            "title": "科学数据的标准规范体系框架研究",
-            "source": "图书馆",
-            "year": "2016",
             "volume": None,
-            "issue": "5",
-        },
-    ),
-    (
-        "18.Ahirwar,J..Five Laws of Library Science and Information Economics.Informatics Studies.2021.7(1)",
-        {
-            "type": "english-paper",
-            "author": "Ahirwar,J.",
-            "title": "Five Laws of Library Science and Information Economics",
-            "source": "Informatics Studies",
-            "year": "2021",
-            "volume": "7",
-            "issue": "1",
+            "issue": None,
+            "page": None,
         },
     ),
 ]
@@ -371,49 +235,15 @@ def test_clean(input, expected):
     assert ParseCssci.clean(input) == expected
 
 
-@pytest.mark.parametrize("input, expected", test_web_data)
-def test_parse_web(input, expected):
-    assert ParseCssci(input).parse_web() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_standard_data)
-def test_parse_standard(input, expected):
-    assert ParseCssci(input).parse_standard() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_book_data)
-def test_parse_book(input, expected):
-    assert ParseCssci(input).parse_book() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_thesis_data)
-def test_parse_thesis(input, expected):
-    assert ParseCssci(input).parse_thesis() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_newspaper_data)
-def test_parse_newspaper(input, expected):
-    assert ParseCssci(input).parse_newspaper() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_patent1_data)
-def test_parse_patent1(input, expected):
-    assert ParseCssci(input).parse_patent1() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_patent2_data)
-def test_parse_patent2(input, expected):
-    assert ParseCssci(input).parse_patent2() == expected
-
-
-@pytest.mark.parametrize("input, expected", test_paper_data)
-def test_parse_paper(input, expected):
-    assert ParseCssci(input).parse_paper() == expected
+@pytest.mark.parametrize("input, expected", test_drop_data)
+def test_drop(input, expected):
+    assert ParseCssci.drop(input) == expected
 
 
-@pytest.mark.parametrize("input, expected", test_english_data)
-def test_parse_english(input, expected):
-    assert ParseCssci(input).parse_english() == expected
+@pytest.mark.parametrize("input, expected", test_author_data)
+def test_extract_author(input, expected):
+    ref = ParseCssci.clean(input)
+    assert ParseCssci(ref).extract_author() == expected
 
 
 @pytest.mark.parametrize("input, expected", test_parse_data)