Skip to content

Snippet: Checking for dead links to Phosphosite Plus

krassowski edited this page Jun 16, 2017 · 2 revisions
import sys                                                                                                                                            
import requests
from requests.exceptions import ConnectTimeout, ReadTimeout
from tqdm import tqdm
from app import create_app
from models import Protein

# URL pattern of the PhosphoSitePlus lookup page; `{accession}` is filled
# in with a UniProt accession via `str.format`.
template = 'http://www.phosphosite.org/uniprotAccAction?id={accession}'

# Instantiate the application with the LOAD_STATS option switched off.
# NOTE(review): presumably this call is what makes `Protein.query` usable
# further down — confirm against `app.create_app`.
create_app(config_override={'LOAD_STATS': False})


def request(url, times=5, timeout=1):
    """Fetch *url* with a GET request, retrying when the request times out.

    Args:
        url: Address to fetch.
        times: Number of extra attempts made after the first one times
            out, so at most ``times + 1`` requests are sent. ``None`` and
            negative values are treated as zero.
        timeout: Value passed as the ``timeout`` argument of
            ``requests.get`` on every attempt.

    Returns:
        The ``requests.Response`` of the first attempt that did not time
        out, or ``None`` when every attempt timed out.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an
            error status; such responses are not retried.
    """
    # A bounded loop instead of recursion: a negative `times` can no longer
    # count down forever, and the number of attempts is explicit.
    attempts = max(times or 0, 0) + 1

    for _ in range(attempts):
        try:
            r = requests.get(url, timeout=timeout)
        except (ConnectTimeout, ReadTimeout):
            # Only timeouts are retried; other errors propagate to the caller.
            continue

        # No-op for successful responses, raises HTTPError otherwise.
        r.raise_for_status()
        return r

    return None


def main():
    """Look up every UniProt entry of every protein on PhosphoSitePlus.

    Proteins without external references, or without UniProt entries, are
    recorded and not queried. For the remaining ones each UniProt accession
    is requested; accessions whose page contains the "no records" message
    are collected, and those flagged as reviewed are additionally reported
    on stdout. The numbers of tested and skipped accessions are printed at
    the end.

    Returns:
        dict with the keys 'not_found', 'no_uniprot', 'no_externals' and
        'not_found_and_reviewed', each mapping to a list of tuples:
        (gene name, refseq) for the 'no_*' lists and
        (gene name, refseq, accession) for the 'not_found*' lists.
    """
    missing_marker = 'No records found that match the search criteria'

    checked_count = 0
    skipped_count = 0
    missing = []
    missing_reviewed = []
    without_references = []
    without_uniprot = []

    for protein in tqdm(Protein.query.all()):
        prot_rep = (protein.gene.name, protein.refseq)

        references = protein.external_references
        if not references:
            without_references.append(prot_rep)
            continue

        entries = references.uniprot_entries
        if not entries:
            without_uniprot.append(prot_rep)
            continue

        for entry in entries:
            url = template.format(accession=entry.accession)
            response = request(url)

            if not response:
                # Every attempt timed out; count it and move on.
                print('Unable to connect for %s (%s)' % prot_rep)
                skipped_count += 1
                continue

            if missing_marker in response.text:
                record = prot_rep + (entry.accession,)
                missing.append(record)
                if entry.reviewed:
                    missing_reviewed.append(record)
                    print('Reviewed but not found: %s (%s)' % prot_rep)
                    print(url)

            checked_count += 1

    print('Tested %s' % checked_count)
    print('Skipped %s' % skipped_count)

    return {
        'not_found': missing,
        'no_uniprot': without_uniprot,
        'no_externals': without_references,
        'not_found_and_reviewed': missing_reviewed,
    }


# Run the check as soon as the snippet is executed and keep the returned
# report dict in a module-level name so it can be inspected afterwards.
results = main()