# ----------------------------------------------------- # Summary of all used filtering rules: # ----------------------------------------------------- 1) case-insensitive patterns (Perl regular expressions) # ----------------------------------------------------- # name looks like an amino acid ^(ala(nine)?|cys(teine)?|asp(aragine)?|glu(tamine)?|phe(nylalanine)?|gly(cine)?|his(tidine)?|ile|isoleucine|lys(ine)?|leu(cine)?|met(hionine)?|asn|aspartic acid|aspartate|pro(line)?|gln|glutamic acid|glutamate|arg(inine)?|ser(ine)?|thr(eonine)?|val(ine)?|trp|tryptophane|tyr(osine)?)(-| )?\(?\d+\)?$ # type/class/Greek letter plus a number is not enough for a full name ^(type|beta|alpha|gamma|eps[iy]lon|zeta|kappa|class)( |-)(\d+|[ivx]+)$ # kDa pattern ^[\d.]+[ \-]?k(Da?|b(yte)?)$ # name contains punctuation characters (^| )[.,!?:;]( |$) # name contains too many dots \.[^.]+\.[^.]+\. # name contains 'cell-type' keywords at the end (^| |-)(cells?|\w+cytes?)$ # name contains a number plus a plural word ^([\d.,]+|two|three|four|fife|six|seven|eight|nine|ten|eleven|twelve) \w+[^s]s$ # name contains 'cell' at the beginning plus only one following word (like cell type or cell division) ^(cells?)( |-)\w+$ # name contains no letters ^[^a-z]+$ # too many consecutive digits \d{5} # name contains rare characters (#=;%°@$?!) [=%;#@$?!]|°.?[CF] # name seems to be a physical unit (\/|^\d+ ?)(cm2|[nm]?mol|ml|kg|l|h|mg|g|min)( |$) # name seems to be an author's name \bet( \.)? 
al\b 2) case-sensitive patterns (Perl regular expressions) # ----------------------------------------------------- # mutations ^([ACDEFGHIKLMNPQRSTVWY]|Ala(nine)?|Cys(teine)?|Asp(aragine)?|Glu(tamine)?|Phe(nylalanine)?|Gly(cine)?|His(tidine)?|Ile|Isoleucine|Lys(ine)?|Leu(cine)?|Met(hionine)?|Asn|Aspartic acid|Aspartate|Pro(line)?|Gln|Glutamic acid|Glutamate|Arg(inine)?|Ser(ine)?|Thr(eonine)?|Val(ine)?|Trp|Tryptophane|Tyr(osine)?)\d{2,4}([ACDEFGHIKLMNPQRSTVWY]|Ala(nine)?|Cys(teine)?|Asp(aragine)?|Glu(tamine)?|Phe(nylalanine)?|Gly(cine)?|His(tidine)?|Ile|Isoleucine|Lys(ine)?|Leu(cine)?|Met(hionine)?|Asn|Aspartic acid|Aspartate|Pro(line)?|Gln|Glutamic acid|Glutamate|Arg(inine)?|Ser(ine)?|Thr(eonine)?|Val(ine)?|Trp|Tryptophane|Tyr(osine)?)$ # amino acids ^(Ala|Cys|Asp|Glu|Phe|Gly|His|Ile|Lys|Leu|Met|Asn|Pro|Gln|Arg|Ser|Thr|Val|Trp|Tyr)$ # amino acids plus position ^[ACDEFGHIKLMNPQRSTVWY]\d{3}$ # name seems to be an internet address (^(ht|f)tp://)|(\.html?)|(\.(com|de|uk|il|it|gov|edu|es|fr|nl|dk|ru|jp|ca|net)$) # name looks like a person's name ^[A-Z]\. ?[A-Z][a-z]{4,}$ # name looks like a DNA sequence ^[AGTC]{5,}$ # name looks like an RNA sequence ^[AGUC]{5,}$ # name looks like an RNA/DNA type ^([a-z]+[RD]NAs?|RNAi)$ # name seems to be a physical unit (2) ^(pH|pK[asb]?|pI)( =? 
?[\d.]+)?$ # name seems to be a nucleotide ^(nt[- ]?\d+|d?[AGCTU][TDM]P|N[TDM]Ps?)$ # name looks like a nucleotide 3) complex rules # ----------------------------------------------------- # the last four letters of the sample centre are in the 'chemical compound endings database' # sample centre precedes 'cell' or 'cells' # sample centre follows 'virus' or 'virus type \d+' # sample centre is only one token long and is in the common dictionary # sample centre is exactly two tokens long where the first word is a number => cannot end in the following pattern: (protein|gene|factor|hormone|homolog(ue)?|collagen|pump|antibody|precursor|molecule|isoform|receptor|regulator|proteasome|allele|peptide|keratin|cytokine|chemokine|activator|transporter|ribozyme|antigen|translocator|subunit|repressor|receptor|channel|inhibitor|enzyme|ase|in)$ # sample centre precedes '=' (seems to be part of an equation) # sample centre has a number as its last token and is followed by a plural noun