001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.validator;
018
019 import java.io.Serializable;
020 import java.util.Arrays;
021 import java.util.HashSet;
022 import java.util.Set;
023
024 import org.apache.commons.validator.routines.InetAddressValidator;
025 import org.apache.commons.validator.util.Flags;
026 import org.apache.oro.text.perl.Perl5Util;
027
028 /**
029 * <p>Validates URLs.</p>
030 * Behavour of validation is modified by passing in options:
031 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path
032 * component.</li>
033 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is
034 * included then fragments are flagged as illegal.</li>
035 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are
036 * considered valid schemes. Enabling this option will let any scheme pass validation.</li>
037 *
038 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
039 * http://javascript.internet.com. However, this validation now bears little resemblance
040 * to the php original.</p>
041 * <pre>
042 * Example of usage:
043 * Construct a UrlValidator with valid schemes of "http", and "https".
044 *
045 * String[] schemes = {"http","https"}.
046 * UrlValidator urlValidator = new UrlValidator(schemes);
047 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
048 * System.out.println("url is valid");
049 * } else {
050 * System.out.println("url is invalid");
051 * }
052 *
053 * prints "url is invalid"
054 * If instead the default constructor is used.
055 *
056 * UrlValidator urlValidator = new UrlValidator();
057 * if (urlValidator.isValid("ftp://foo.bar.com/")) {
058 * System.out.println("url is valid");
059 * } else {
060 * System.out.println("url is invalid");
061 * }
062 *
063 * prints out "url is valid"
064 * </pre>
065 *
066 * @see
067 * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
068 * Uniform Resource Identifiers (URI): Generic Syntax
069 * </a>
070 *
071 * @version $Revision: 588054 $ $Date: 2007-10-25 00:12:12 +0200 (Do, 25. Okt 2007) $
072 * @since Validator 1.1
073 * @deprecated Use the new UrlValidator in the routines package. This class
074 * will be removed in a future release.
075 */
076 public class UrlValidator implements Serializable {
077
078 /**
079 * Allows all validly formatted schemes to pass validation instead of
080 * supplying a set of valid schemes.
081 */
082 public static final int ALLOW_ALL_SCHEMES = 1 << 0;
083
084 /**
085 * Allow two slashes in the path component of the URL.
086 */
087 public static final int ALLOW_2_SLASHES = 1 << 1;
088
089 /**
090 * Enabling this options disallows any URL fragments.
091 */
092 public static final int NO_FRAGMENTS = 1 << 2;
093
094 private static final String ALPHA_CHARS = "a-zA-Z";
095
096 private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
097
098 private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
099
100 private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
101
102 private static final String SCHEME_CHARS = ALPHA_CHARS;
103
104 // Drop numeric, and "+-." for now
105 private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
106
107 private static final String ATOM = VALID_CHARS + '+';
108
109 /**
110 * This expression derived/taken from the BNF for URI (RFC2396).
111 */
112 private static final String URL_PATTERN =
113 "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/";
114 // 12 3 4 5 6 7 8 9
115
116 /**
117 * Schema/Protocol (ie. http:, ftp:, file:, etc).
118 */
119 private static final int PARSE_URL_SCHEME = 2;
120
121 /**
122 * Includes hostname/ip and port number.
123 */
124 private static final int PARSE_URL_AUTHORITY = 4;
125
126 private static final int PARSE_URL_PATH = 5;
127
128 private static final int PARSE_URL_QUERY = 7;
129
130 private static final int PARSE_URL_FRAGMENT = 9;
131
132 /**
133 * Protocol (ie. http:, ftp:,https:).
134 */
135 private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/";
136
137 private static final String AUTHORITY_PATTERN =
138 "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/";
139 // 1 2 3 4
140
141 private static final int PARSE_AUTHORITY_HOST_IP = 1;
142
143 private static final int PARSE_AUTHORITY_PORT = 2;
144
145 /**
146 * Should always be empty.
147 */
148 private static final int PARSE_AUTHORITY_EXTRA = 3;
149
150 private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/";
151
152 private static final String QUERY_PATTERN = "/^(.*)$/";
153
154 private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/";
155
156 private static final String DOMAIN_PATTERN =
157 "/^" + ATOM + "(\\." + ATOM + ")*$/";
158
159 private static final String PORT_PATTERN = "/^:(\\d{1,5})$/";
160
161 private static final String ATOM_PATTERN = "/(" + ATOM + ")/";
162
163 private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/";
164
165 /**
166 * Holds the set of current validation options.
167 */
168 private Flags options = null;
169
170 /**
171 * The set of schemes that are allowed to be in a URL.
172 */
173 private Set allowedSchemes = new HashSet();
174
175 /**
176 * If no schemes are provided, default to this set.
177 */
178 protected String[] defaultSchemes = {"http", "https", "ftp"};
179
180 /**
181 * Create a UrlValidator with default properties.
182 */
183 public UrlValidator() {
184 this(null);
185 }
186
187 /**
188 * Behavior of validation is modified by passing in several strings options:
189 * @param schemes Pass in one or more url schemes to consider valid, passing in
190 * a null will default to "http,https,ftp" being valid.
191 * If a non-null schemes is specified then all valid schemes must
192 * be specified. Setting the ALLOW_ALL_SCHEMES option will
193 * ignore the contents of schemes.
194 */
195 public UrlValidator(String[] schemes) {
196 this(schemes, 0);
197 }
198
199 /**
200 * Initialize a UrlValidator with the given validation options.
201 * @param options The options should be set using the public constants declared in
202 * this class. To set multiple options you simply add them together. For example,
203 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
204 */
205 public UrlValidator(int options) {
206 this(null, options);
207 }
208
209 /**
210 * Behavour of validation is modified by passing in options:
211 * @param schemes The set of valid schemes.
212 * @param options The options should be set using the public constants declared in
213 * this class. To set multiple options you simply add them together. For example,
214 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options.
215 */
216 public UrlValidator(String[] schemes, int options) {
217 this.options = new Flags(options);
218
219 if (this.options.isOn(ALLOW_ALL_SCHEMES)) {
220 return;
221 }
222
223 if (schemes == null) {
224 schemes = this.defaultSchemes;
225 }
226
227 this.allowedSchemes.addAll(Arrays.asList(schemes));
228 }
229
230 /**
231 * <p>Checks if a field has a valid url address.</p>
232 *
233 * @param value The value validation is being performed on. A <code>null</code>
234 * value is considered invalid.
235 * @return true if the url is valid.
236 */
237 public boolean isValid(String value) {
238 if (value == null) {
239 return false;
240 }
241
242 Perl5Util matchUrlPat = new Perl5Util();
243 Perl5Util matchAsciiPat = new Perl5Util();
244
245 if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
246 return false;
247 }
248
249 // Check the whole url address structure
250 if (!matchUrlPat.match(URL_PATTERN, value)) {
251 return false;
252 }
253
254 if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
255 return false;
256 }
257
258 if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
259 return false;
260 }
261
262 if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
263 return false;
264 }
265
266 if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
267 return false;
268 }
269
270 if (!isValidFragment(matchUrlPat.group(PARSE_URL_FRAGMENT))) {
271 return false;
272 }
273
274 return true;
275 }
276
277 /**
278 * Validate scheme. If schemes[] was initialized to a non null,
279 * then only those scheme's are allowed. Note this is slightly different
280 * than for the constructor.
281 * @param scheme The scheme to validate. A <code>null</code> value is considered
282 * invalid.
283 * @return true if valid.
284 */
285 protected boolean isValidScheme(String scheme) {
286 if (scheme == null) {
287 return false;
288 }
289
290 Perl5Util schemeMatcher = new Perl5Util();
291 if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
292 return false;
293 }
294
295 if (this.options.isOff(ALLOW_ALL_SCHEMES)) {
296
297 if (!this.allowedSchemes.contains(scheme)) {
298 return false;
299 }
300 }
301
302 return true;
303 }
304
305 /**
306 * Returns true if the authority is properly formatted. An authority is the combination
307 * of hostname and port. A <code>null</code> authority value is considered invalid.
308 * @param authority Authority value to validate.
309 * @return true if authority (hostname and port) is valid.
310 */
311 protected boolean isValidAuthority(String authority) {
312 if (authority == null) {
313 return false;
314 }
315
316 Perl5Util authorityMatcher = new Perl5Util();
317 InetAddressValidator inetAddressValidator =
318 InetAddressValidator.getInstance();
319
320 if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
321 return false;
322 }
323
324 boolean hostname = false;
325 // check if authority is IP address or hostname
326 String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
327 boolean ipV4Address = inetAddressValidator.isValid(hostIP);
328
329 if (!ipV4Address) {
330 // Domain is hostname name
331 Perl5Util domainMatcher = new Perl5Util();
332 hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
333 }
334
335 //rightmost hostname will never start with a digit.
336 if (hostname) {
337 // LOW-TECH FIX FOR VALIDATOR-202
338 // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
339 char[] chars = hostIP.toCharArray();
340 int size = 1;
341 for(int i=0; i<chars.length; i++) {
342 if(chars[i] == '.') {
343 size++;
344 }
345 }
346 String[] domainSegment = new String[size];
347 boolean match = true;
348 int segmentCount = 0;
349 int segmentLength = 0;
350 Perl5Util atomMatcher = new Perl5Util();
351
352 while (match) {
353 match = atomMatcher.match(ATOM_PATTERN, hostIP);
354 if (match) {
355 domainSegment[segmentCount] = atomMatcher.group(1);
356 segmentLength = domainSegment[segmentCount].length() + 1;
357 hostIP =
358 (segmentLength >= hostIP.length())
359 ? ""
360 : hostIP.substring(segmentLength);
361
362 segmentCount++;
363 }
364 }
365 String topLevel = domainSegment[segmentCount - 1];
366 if (topLevel.length() < 2 || topLevel.length() > 4) {
367 return false;
368 }
369
370 // First letter of top level must be a alpha
371 Perl5Util alphaMatcher = new Perl5Util();
372 if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) {
373 return false;
374 }
375
376 // Make sure there's a host name preceding the authority.
377 if (segmentCount < 2) {
378 return false;
379 }
380 }
381
382 if (!hostname && !ipV4Address) {
383 return false;
384 }
385
386 String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
387 if (port != null) {
388 Perl5Util portMatcher = new Perl5Util();
389 if (!portMatcher.match(PORT_PATTERN, port)) {
390 return false;
391 }
392 }
393
394 String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
395 if (!GenericValidator.isBlankOrNull(extra)) {
396 return false;
397 }
398
399 return true;
400 }
401
402 /**
403 * Returns true if the path is valid. A <code>null</code> value is considered invalid.
404 * @param path Path value to validate.
405 * @return true if path is valid.
406 */
407 protected boolean isValidPath(String path) {
408 if (path == null) {
409 return false;
410 }
411
412 Perl5Util pathMatcher = new Perl5Util();
413
414 if (!pathMatcher.match(PATH_PATTERN, path)) {
415 return false;
416 }
417
418 int slash2Count = countToken("//", path);
419 if (this.options.isOff(ALLOW_2_SLASHES) && (slash2Count > 0)) {
420 return false;
421 }
422
423 int slashCount = countToken("/", path);
424 int dot2Count = countToken("..", path);
425 if (dot2Count > 0) {
426 if ((slashCount - slash2Count - 1) <= dot2Count) {
427 return false;
428 }
429 }
430
431 return true;
432 }
433
434 /**
435 * Returns true if the query is null or it's a properly formatted query string.
436 * @param query Query value to validate.
437 * @return true if query is valid.
438 */
439 protected boolean isValidQuery(String query) {
440 if (query == null) {
441 return true;
442 }
443
444 Perl5Util queryMatcher = new Perl5Util();
445 return queryMatcher.match(QUERY_PATTERN, query);
446 }
447
448 /**
449 * Returns true if the given fragment is null or fragments are allowed.
450 * @param fragment Fragment value to validate.
451 * @return true if fragment is valid.
452 */
453 protected boolean isValidFragment(String fragment) {
454 if (fragment == null) {
455 return true;
456 }
457
458 return this.options.isOff(NO_FRAGMENTS);
459 }
460
461 /**
462 * Returns the number of times the token appears in the target.
463 * @param token Token value to be counted.
464 * @param target Target value to count tokens in.
465 * @return the number of tokens.
466 */
467 protected int countToken(String token, String target) {
468 int tokenIndex = 0;
469 int count = 0;
470 while (tokenIndex != -1) {
471 tokenIndex = target.indexOf(token, tokenIndex);
472 if (tokenIndex > -1) {
473 tokenIndex++;
474 count++;
475 }
476 }
477 return count;
478 }
479 }